Python source code examples: allennlp.data.vocabulary.Vocabulary()
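
Before the examples, here is a minimal sketch of the core `Vocabulary` API that the snippets below rely on (the tokens and namespace names are illustrative):

from allennlp.data.vocabulary import Vocabulary

vocab = Vocabulary()
# Namespaces keep unrelated string-to-id mappings separate (e.g. words vs. labels).
vocab.add_token_to_namespace("hello", namespace="tokens")
vocab.add_token_to_namespace("positive", namespace="labels")

vocab.get_token_index("hello", namespace="tokens")   # id assigned to "hello"
vocab.get_vocab_size("labels")                       # number of entries in "labels"
vocab.get_index_to_token_vocabulary("labels")        # {id: token} mapping, as used in Example 1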

Example 1
def __init__(self,
                 vocabulary: Vocabulary,
                 namespace: str = "intent_labels",
                 ignore_classes: List[str] = None,
                 coarse: bool = True) -> None:
        """
        Parameters
        ----------
        vocabulary : ``Vocabulary``, required.
            A vocabulary containing the label namespace.
        namespace : str, optional (default = "intent_labels").
            The vocabulary namespace for labels.
        ignore_classes : List[str], optional.
            Labels which will be ignored when computing metrics.
        """
        self._label_vocabulary = vocabulary.get_index_to_token_vocabulary(namespace)
        self._ignore_classes: List[str] = ignore_classes or []
        self._coarse = coarse

        # These will hold per label span counts.
        self._true_positives: Dict[str, int] = defaultdict(int)
        self._false_positives: Dict[str, int] = defaultdict(int)
        self._false_negatives: Dict[str, int] = defaultdict(int) 
Example 2
def __init__(self,
                 embedder: TextFieldEmbedder,
                 vocab: Vocabulary,
                 lm_head: LanguageModelHead = None,
                 clf_head: ClassificationHead = None,
                 language_model_weight: float = 0.5) -> None:
        
        super().__init__(vocab)
        
        # At least one of lm_head or clf_head must be provided.
        assert not (lm_head is None and clf_head is None)
        
        self.embedder = embedder
        self.clf_head = clf_head
        self.lm_head = lm_head
        self.language_model_weight = language_model_weight
        self.vocab = vocab 
Example 3
def tokens_to_indices(self, tokens: List[Token], vocabulary: Vocabulary) -> IndexedTokenList:
        self._matched_indexer._add_encoding_to_vocabulary_if_needed(vocabulary)

        wordpieces, offsets = self._allennlp_tokenizer.intra_word_tokenize([t.text for t in tokens])

        # For tokens that don't correspond to any word pieces, we put (-1, -1) into the offsets.
        # That results in the embedding for that token being all zeros.
        offsets = [x if x is not None else (-1, -1) for x in offsets]

        output: IndexedTokenList = {
            "token_ids": [t.text_id for t in wordpieces],
            "mask": [True] * len(tokens),  # for original tokens (i.e. word-level)
            "type_ids": [t.type_id for t in wordpieces],
            "offsets": offsets,
            "wordpiece_mask": [True] * len(wordpieces),  # for wordpieces (i.e. subword-level)
        }

        return self._matched_indexer._postprocess_output(output) 
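A rough illustration of the offsets convention above; the wordpiece split is hypothetical and depends on the actual transformer vocabulary:

# tokens:     [Token("AllenNLP"), Token("is")]
# wordpieces: ["[CLS]", "allen", "##nl", "##p", "is", "[SEP]"]
# offsets:    [(1, 3), (4, 4)]  # one inclusive (start, end) wordpiece span per original token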
Example 4
def _add_encoding_to_vocabulary_if_needed(self, vocab: Vocabulary) -> None:
        """
        Copies tokens from the ``transformers`` model's vocab to the specified namespace.
        """
        if self._added_to_vocabulary:
            return

        try:
            vocab_items = self._tokenizer.get_vocab().items()
        except NotImplementedError:
            vocab_items = (
                (self._tokenizer.convert_ids_to_tokens(idx), idx)
                for idx in range(self._tokenizer.vocab_size)
            )
        for word, idx in vocab_items:
            vocab._token_to_index[self._namespace][word] = idx
            vocab._index_to_token[self._namespace][idx] = word

        self._added_to_vocabulary = True 
Example 5
def tokens_to_indices(
        self, tokens: List[Token], vocabulary: Vocabulary
    ) -> Dict[str, List[int]]:
        indices: List[int] = []

        for token in itertools.chain(self._start_tokens, tokens, self._end_tokens):
            text = self._get_feature_value(token)
            if self.namespace is None:
                # We could have a check here that `text` is an int; not sure it's worth it.
                indices.append(text)  # type: ignore
            else:
                if self.lowercase_tokens:
                    text = text.lower()
                indices.append(vocabulary.get_token_index(text, self.namespace))

        return {"tokens": indices} 
Example 6
def tokens_to_indices(
        self, tokens: List[Token], vocabulary: Vocabulary
    ) -> Dict[str, List[List[int]]]:
        indices: List[List[int]] = []
        for token in itertools.chain(self._start_tokens, tokens, self._end_tokens):
            token_indices: List[int] = []
            if token.text is None:
                raise ConfigurationError(
                    "TokenCharactersIndexer needs a tokenizer that retains text"
                )
            for character in self._character_tokenizer.tokenize(token.text):
                if getattr(character, "text_id", None) is not None:
                    # `text_id` being set on the token means that we aren't using the vocab, we just
                    # use this id instead.
                    index = character.text_id
                else:
                    index = vocabulary.get_token_index(character.text, self._namespace)
                token_indices.append(index)
            indices.append(token_indices)
        return {"token_characters": indices} 
Example 7
def count_vocab_items(self, counter: Dict[str, Dict[str, int]]):
        """
        If there are strings in this field that need to be converted into integers through a
        :class:`Vocabulary`, here is where we count them, to determine which tokens are in or out
        of the vocabulary.

        If your `Field` does not have any strings that need to be converted into indices, you do
        not need to implement this method.

        A note on this `counter`: because `Fields` can represent conceptually different things,
        we separate the vocabulary items by `namespaces`.  This way, we can use a single shared
        mechanism to handle all mappings from strings to integers in all fields, while keeping
        words in a `TextField` from sharing the same ids with labels in a `LabelField` (e.g.,
        "entailment" or "contradiction" are labels in an entailment task)

        Additionally, a single `Field` might want to use multiple namespaces - `TextFields` can
        be represented as a combination of word ids and character ids, and you don't want words and
        characters to share the same vocabulary - "a" as a word should get a different id from "a"
        as a character, and the vocabulary sizes of words and characters are very different.

        Because of this, the first key in the `counter` object is a `namespace`, like "tokens",
        "token_characters", "tags", or "labels", and the second key is the actual vocabulary item.
        """
        pass 
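A sketch of the counter structure this method fills in; the namespaces and counts are illustrative:

from collections import defaultdict
from typing import Dict

counter: Dict[str, Dict[str, int]] = defaultdict(lambda: defaultdict(int))
# A TextField would count its words under a "tokens"-like namespace...
counter["tokens"]["cat"] += 1
counter["tokens"]["sat"] += 1
# ...while a LabelField counts labels under a separate namespace.
counter["labels"]["entailment"] += 1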
Example 8
def test_multilabel_field_empty_field_works(self):
        vocab = Vocabulary()
        vocab.add_token_to_namespace("label1", namespace="test_empty_labels")
        vocab.add_token_to_namespace("label2", namespace="test_empty_labels")

        f = MultiLabelField([], label_namespace="test_empty_labels")
        f.index(vocab)
        tensor = f.as_tensor(f.get_padding_lengths()).detach().cpu().numpy()
        numpy.testing.assert_array_almost_equal(tensor, numpy.array([0, 0]))
        g = f.empty_field()
        g.index(vocab)
        tensor = g.as_tensor(g.get_padding_lengths()).detach().cpu().numpy()
        numpy.testing.assert_array_almost_equal(tensor, numpy.array([0, 0]))

        h = MultiLabelField(
            [0, 0, 1], label_namespace="test_empty_labels", num_labels=3, skip_indexing=True
        )
        tensor = h.empty_field().as_tensor(None).detach().cpu().numpy()
        numpy.testing.assert_array_almost_equal(tensor, numpy.array([0, 0, 0])) 
Example 9
def __init__(self,
                 word_embeddings: TextFieldEmbedder,
                 vocab: Vocabulary,
                 lstm_hidden_dim: int,
                 top_k: int,
                 cuda_device: int) -> None:
        super().__init__(vocab)

        self.word_embeddings = word_embeddings

        self.query_rep = nn.LSTM(self.word_embeddings.get_output_dim(), lstm_hidden_dim, batch_first=True, bidirectional=True)
        self.doc_rep = nn.LSTM(self.word_embeddings.get_output_dim(), lstm_hidden_dim, batch_first=True, bidirectional=True)

        # this does not really do "attention" - just a plain cosine matrix calculation (without learnable weights) 
        self.cosine_module = CosineMatrixAttention()

        self.top_k = top_k

        self.dense = nn.Linear(top_k, out_features=20, bias=True)
        self.dense2 = nn.Linear(20, out_features=20, bias=True)
        self.dense3 = nn.Linear(20, out_features=1, bias=False) 
Example 10
def __init__(self,
                 vocab: Vocabulary,
                 word_embedder: TextFieldEmbedder,
                 character_embedder: TextFieldEmbedder,
                 encoder: Seq2SeqEncoder,
                 character_encoder: Seq2VecEncoder) -> None:
        super().__init__(vocab)

        self._word_embedder = word_embedder
        self._character_embedder = character_embedder
        self._character_encoder = character_encoder
        self._encoder = encoder
        self._classifier = torch.nn.Linear(
            in_features=encoder.get_output_dim(),
            out_features=vocab.get_vocab_size('labels')
        )

        self._f1 = SpanBasedF1Measure(vocab, 'labels') 
Example 11
def __init__(self,
                 vocab: Vocabulary,
                 embedder: TextFieldEmbedder,
                 encoder: Seq2SeqEncoder) -> None:
        super().__init__(vocab)

        self._embedder = embedder
        self._encoder = encoder
        self._classifier = torch.nn.Linear(
            in_features=encoder.get_output_dim(),
            out_features=vocab.get_vocab_size('labels')
        )
        self._crf = ConditionalRandomField(
            vocab.get_vocab_size('labels')
        )

        self._f1 = SpanBasedF1Measure(vocab, 'labels') 
Example 12
def _add_encoding_to_vocabulary(self, vocabulary: Vocabulary) -> None:
        # pylint: disable=protected-access
        for word, idx in self.encoder.items():
            vocabulary._token_to_index[self._namespace][word] = idx
            vocabulary._index_to_token[self._namespace][idx] = word 
Example 13
def tokens_to_indices(self,
                          tokens: List[Token],
                          vocabulary: Vocabulary,
                          index_name: str) -> Dict[str, List[int]]:
        if not self._added_to_vocabulary:
            self._add_encoding_to_vocabulary(vocabulary)
            self._added_to_vocabulary = True

        text_tokens = []
        offsets = []
        offset = -1

        for token in tokens:
            bpe_tokens = [self.encoder.get(t, 0) for t in self.byte_pair_encode(token) if self.encoder.get(t, 0) != 0]

            if bpe_tokens:
                offset += len(bpe_tokens)
                offsets.append(offset)
                text_tokens.extend(bpe_tokens)

        num_tokens = len(text_tokens)

        # If there are too many tokens, that's going to cause problems.
        if num_tokens >= self.n_ctx:
            print('Sequence too long. Pruning!')
            text_tokens = text_tokens[:self.n_ctx]
            text_tokens[-2] = self.encoder['__clf__</w>']
            text_tokens[-1] = 0
        else:
            text_tokens.append(0)

        return {
                index_name: text_tokens,
                f"{index_name}-offsets": offsets,
                # add mask here according to the original tokens,
                # because calling util.get_text_field_mask on the
                # "byte pair" tokens will produce the wrong shape
                "mask": [1 for _ in offsets]
        } 
Example 14
def _add_encoding_to_vocabulary(self, vocabulary: Vocabulary) -> None:
        # pylint: disable=protected-access
        for word, idx in self.vocab.items():
            vocabulary._token_to_index[self._namespace][word] = idx
            vocabulary._index_to_token[self._namespace][idx] = word 
Example 15
def tokens_to_indices(self, tokens: List[Token], vocabulary: Vocabulary) -> IndexedTokenList:
        self._add_encoding_to_vocabulary_if_needed(vocabulary)

        indices, type_ids = self._extract_token_and_type_ids(tokens)
        # The mask has 1 for real tokens and 0 for padding tokens. Only real tokens are attended to.
        output: IndexedTokenList = {
            "token_ids": indices,
            "mask": [True] * len(indices),
            "type_ids": type_ids,
        }

        return self._postprocess_output(output) 
Example 16
def indices_to_tokens(
        self, indexed_tokens: IndexedTokenList, vocabulary: Vocabulary
    ) -> List[Token]:
        token_ids = indexed_tokens["token_ids"]
        type_ids = indexed_tokens.get("type_ids")

        return [
            Token(
                text=vocabulary.get_token_from_index(token_ids[i], self._namespace),
                text_id=token_ids[i],
                type_id=type_ids[i] if type_ids is not None else None,
            )
            for i in range(len(token_ids))
        ] 
Example 17
def tokens_to_indices(self, tokens: List[Token], vocabulary: Vocabulary) -> IndexedTokenList:
        """
        Takes a list of tokens and converts them to an `IndexedTokenList`.
        This could be just an ID for each token from the vocabulary.
        Or it could split each token into characters and return one ID per character.
        Or (for instance, in the case of byte-pair encoding) there might not be a clean
        mapping from individual tokens to indices, and the `IndexedTokenList` could be a complex
        data structure.
        """
        raise NotImplementedError 
Example 18
def indices_to_tokens(
        self, indexed_tokens: IndexedTokenList, vocabulary: Vocabulary
    ) -> List[Token]:
        """
        The inverse operation of `tokens_to_indices`. Takes an `IndexedTokenList` and converts it
        back into a list of tokens.
        """
        raise NotImplementedError 
Example 19
def index_instances(self, vocab: Vocabulary) -> None:
        for instance in self.instances:
            instance.index_fields(vocab) 
Example 20
def __init__(self, instances: List[Instance], vocab: Vocabulary = None):
        self.instances = instances
        self.vocab = vocab 
Example 21
def __init__(
        self,
        instance_generator: Callable[[str], Iterable[Instance]],
        file_path: str,
        vocab: Vocabulary = None,
    ) -> None:
        super().__init__()
        self._instance_generator = instance_generator
        self._file_path = file_path
        self.vocab = vocab 
Example 22
def index_with(self, vocab: Vocabulary):
        self.vocab = vocab 
Example 23
def add_field(self, field_name: str, field: Field, vocab: Vocabulary = None) -> None:
        """
        Add the field to the existing fields mapping.
        If we have already indexed the Instance, then we also index `field`, so
        in that case it is necessary to supply `vocab`.
        """
        self.fields[field_name] = field
        if self.indexed:
            field.index(vocab) 
Example 24
def index_fields(self, vocab: Vocabulary) -> None:
        """
        Indexes all fields in this `Instance` using the provided `Vocabulary`.
        This `mutates` the current object, it does not return a new `Instance`.
        A `DataLoader` will call this on each pass through a dataset; we use the `indexed`
        flag to make sure that indexing only happens once.

        This means that if for some reason you modify your vocabulary after you've
        indexed your instances, you might get unexpected behavior.
        """
        if not self.indexed:
            self.indexed = True
            for field in self.fields.values():
                field.index(vocab) 
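A minimal end-to-end sketch of this indexing flow (field names and contents are illustrative):

from allennlp.data import Instance, Token, Vocabulary
from allennlp.data.fields import LabelField, TextField
from allennlp.data.token_indexers import SingleIdTokenIndexer

instance = Instance({
    "tokens": TextField([Token("the"), Token("cat")], {"tokens": SingleIdTokenIndexer()}),
    "label": LabelField("animal"),
})
vocab = Vocabulary.from_instances([instance])  # counts items via each field's count_vocab_items
instance.index_fields(vocab)                   # indexes every field; later calls are no-ops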
Example 25
def index(self, vocab: Vocabulary):
        self._mapping_array = [
            vocab.get_token_index(x.text, self._target_namespace) for x in self._source_tokens
        ] 
Example 26
def index(self, vocab: Vocabulary):
        for field in self.field_list:
            field.index(vocab) 
Example 27
def _maybe_warn_for_namespace(self, label_namespace: str) -> None:
        if not (self._label_namespace.endswith("labels") or self._label_namespace.endswith("tags")):
            if label_namespace not in self._already_warned_namespaces:
                logger.warning(
                    "Your label namespace was '%s'. We recommend you use a namespace "
                    "ending with 'labels' or 'tags', so we don't add UNK and PAD tokens by "
                    "default to your vocabulary.  See documentation for "
                    "`non_padded_namespaces` parameter in Vocabulary.",
                    self._label_namespace,
                )
                self._already_warned_namespaces.add(label_namespace) 
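Context for this warning: Vocabulary's non_padded_namespaces defaults to ("*tags", "*labels"), so only namespaces ending in "tags" or "labels" skip the @@PADDING@@ and @@UNKNOWN@@ entries. A minimal sketch of opting out a custom namespace (the namespace name is illustrative):

from allennlp.data import Vocabulary

vocab = Vocabulary(non_padded_namespaces=["categories"])
vocab.add_token_to_namespace("sports", namespace="categories")
vocab.get_vocab_size("categories")  # 1 - no padding/OOV entries were added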
Example 28
def index(self, vocab: Vocabulary):
        if not self._skip_indexing:
            self._label_id = vocab.get_token_index(
                self.label, self._label_namespace  # type: ignore
            ) 
Example 29
def index(self, vocab: Vocabulary):
        if self.labels is not None:
            self._indexed_labels = [
                vocab.get_token_index(label, self._label_namespace) for label in self.labels
            ]