Python source code examples: allennlp.data.tokenizers.token.Token()
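The Token class shown in these examples is a lightweight container for a single token: it stores the surface text together with optional metadata such as the character offset (idx), linguistic annotations (lemma_, pos_, tag_), and precomputed ids (text_id, type_id). As a minimal sketch, not taken from the examples below and assuming a recent AllenNLP release where Token exposes these fields, constructing one looks like this:

from allennlp.data.tokenizers import Token

# Plain token: surface text plus its character offset in the source string.
token = Token(text="car", idx=3)

# Pre-indexed token: text_id carries a vocabulary id computed elsewhere (e.g. by a
# pretrained transformer tokenizer), so indexers can use it instead of a Vocabulary.
wordpiece = Token(text="##ing", text_id=1234, type_id=0)

The examples below show how dataset readers, tokenizers, and token indexers create and consume such Token objects.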

Example 1
def text_to_instance(self,  # type: ignore
                         tokens: List[str],
                         entity_1: Tuple[int, int],
                         entity_2: Tuple[int, int],
                         label: str = None) -> Instance:
        # pylint: disable=arguments-differ
        fields: Dict[str, Field] = {}
        
        tokens = [OpenAISplitter._standardize(token) for token in tokens]
        tokens = (['__start__'] + tokens[entity_1[0]:entity_1[1] + 1] + ['__del1__']
                  + tokens[entity_2[0]:entity_2[1] + 1] + ['__del2__']
                  + tokens + ['__clf__'])
            
        sentence = TextField([Token(text=t) for t in tokens], self._token_indexers)
        fields['sentence'] = sentence
        #fields['entity1'] = SpanField(*entity_1, sequence_field=sentence)
        #fields['entity2'] = SpanField(*entity_2, sequence_field=sentence)
        
        if label:
            fields['label'] = LabelField(label)

        return Instance(fields) 
Example 2
def tokens_to_indices(self, tokens: List[Token], vocabulary: Vocabulary) -> IndexedTokenList:
        self._matched_indexer._add_encoding_to_vocabulary_if_needed(vocabulary)

        wordpieces, offsets = self._allennlp_tokenizer.intra_word_tokenize([t.text for t in tokens])

        # For tokens that don't correspond to any word pieces, we put (-1, -1) into the offsets.
        # That results in the embedding for the token to be all zeros.
        offsets = [x if x is not None else (-1, -1) for x in offsets]

        output: IndexedTokenList = {
            "token_ids": [t.text_id for t in wordpieces],
            "mask": [True] * len(tokens),  # for original tokens (i.e. word-level)
            "type_ids": [t.type_id for t in wordpieces],
            "offsets": offsets,
            "wordpiece_mask": [True] * len(wordpieces),  # for wordpieces (i.e. subword-level)
        }

        return self._matched_indexer._postprocess_output(output) 
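A quick sanity check of the shape conventions in the output dictionary above; the values are illustrative only and not the output of any particular transformer tokenizer. token_ids, type_ids, and wordpiece_mask are wordpiece-level, while mask and offsets stay aligned with the original word-level tokens:

tokens = ["New", "York"]                          # 2 original word-level tokens
wordpieces = ["[CLS]", "New", "York", "[SEP]"]    # 4 wordpieces, incl. special tokens
offsets = [(1, 1), (2, 2)]                        # one (start, end) pair per original token

output = {
    "token_ids": [101, 1203, 1365, 102],          # made-up ids, one per wordpiece
    "mask": [True] * len(tokens),
    "type_ids": [0] * len(wordpieces),
    "offsets": offsets,
    "wordpiece_mask": [True] * len(wordpieces),
}
assert len(output["mask"]) == len(output["offsets"]) == len(tokens)
assert len(output["token_ids"]) == len(output["wordpiece_mask"]) == len(wordpieces)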
Example 3
def __init__(
        self,
        namespace: Optional[str] = "tokens",
        lowercase_tokens: bool = False,
        start_tokens: List[str] = None,
        end_tokens: List[str] = None,
        feature_name: str = "text",
        default_value: str = _DEFAULT_VALUE,
        token_min_padding_length: int = 0,
    ) -> None:
        super().__init__(token_min_padding_length)
        self.namespace = namespace
        self.lowercase_tokens = lowercase_tokens

        self._start_tokens = [Token(st) for st in (start_tokens or [])]
        self._end_tokens = [Token(et) for et in (end_tokens or [])]
        self._feature_name = feature_name
        self._default_value = default_value 
Example 4
def __init__(
        self,
        namespace: str = "token_characters",
        character_tokenizer: CharacterTokenizer = CharacterTokenizer(),
        start_tokens: List[str] = None,
        end_tokens: List[str] = None,
        min_padding_length: int = 0,
        token_min_padding_length: int = 0,
    ) -> None:
        super().__init__(token_min_padding_length)
        if min_padding_length == 0:
            url = "https://github.com/allenai/allennlp/issues/1954"
            warnings.warn(
                "You are using the default value (0) of `min_padding_length`, "
                f"which can cause some subtle bugs (more info see {url}). "
                "Strongly recommend to set a value, usually the maximum size "
                "of the convolutional layer size when using CnnEncoder.",
                UserWarning,
            )
        self._min_padding_length = min_padding_length
        self._namespace = namespace
        self._character_tokenizer = character_tokenizer

        self._start_tokens = [Token(st) for st in (start_tokens or [])]
        self._end_tokens = [Token(et) for et in (end_tokens or [])] 
Example 5
def tokens_to_indices(
        self, tokens: List[Token], vocabulary: Vocabulary
    ) -> Dict[str, List[List[int]]]:
        indices: List[List[int]] = []
        for token in itertools.chain(self._start_tokens, tokens, self._end_tokens):
            token_indices: List[int] = []
            if token.text is None:
                raise ConfigurationError(
                    "TokenCharactersIndexer needs a tokenizer that retains text"
                )
            for character in self._character_tokenizer.tokenize(token.text):
                if getattr(character, "text_id", None) is not None:
                    # `text_id` being set on the token means that we aren't using the vocab, we just
                    # use this id instead.
                    index = character.text_id
                else:
                    index = vocabulary.get_token_index(character.text, self._namespace)
                token_indices.append(index)
            indices.append(token_indices)
        return {"token_characters": indices} 
Example 6
def tokenize(self, text: str) -> List[Token]:
        if self._lowercase_characters:
            text = text.lower()
        if self._byte_encoding is not None:
            # We add 1 here so that we can still use 0 for masking, no matter what bytes we get out
            # of this.
            tokens = [Token(text_id=c + 1) for c in text.encode(self._byte_encoding)]
        else:
            tokens = [Token(t) for t in list(text)]
        for start_token in self._start_tokens:
            if isinstance(start_token, int):
                token = Token(text_id=start_token, idx=0)
            else:
                token = Token(text=start_token, idx=0)
            tokens.insert(0, token)
        for end_token in self._end_tokens:
            if isinstance(end_token, int):
                token = Token(text_id=end_token, idx=0)
            else:
                token = Token(text=end_token, idx=0)
            tokens.append(token)
        return tokens 
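To make the +1 shift in the byte-encoding branch above concrete, here is a small check assuming UTF-8 as the byte encoding: the bytes of "hi" are 104 and 105, so the resulting text_id values become 105 and 106, leaving id 0 free for padding and masking.

text = "hi"
token_ids = [c + 1 for c in text.encode("utf-8")]    # bytes 104, 105 -> ids 105, 106
assert token_ids == [105, 106]
assert 0 not in token_ids                            # id 0 stays reserved for masking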
Example 7
def _intra_word_tokenize(
        self, string_tokens: List[str]
    ) -> Tuple[List[Token], List[Optional[Tuple[int, int]]]]:
        tokens: List[Token] = []
        offsets: List[Optional[Tuple[int, int]]] = []
        for token_string in string_tokens:
            wordpieces = self.tokenizer.encode_plus(
                token_string,
                add_special_tokens=False,
                return_tensors=None,
                return_offsets_mapping=False,
                return_attention_mask=False,
                return_token_type_ids=False,
            )
            wp_ids = wordpieces["input_ids"]

            if len(wp_ids) > 0:
                offsets.append((len(tokens), len(tokens) + len(wp_ids) - 1))
                tokens.extend(
                    Token(text=wp_text, text_id=wp_id)
                    for wp_id, wp_text in zip(wp_ids, self.tokenizer.convert_ids_to_tokens(wp_ids))
                )
            else:
                offsets.append(None)
        return tokens, offsets 
Example 8
def intra_word_tokenize_sentence_pair(
        self, string_tokens_a: List[str], string_tokens_b: List[str]
    ) -> Tuple[List[Token], List[Tuple[int, int]], List[Tuple[int, int]]]:
        """
        Tokenizes each word into wordpieces separately and returns the wordpiece IDs.
        Also calculates offsets such that wordpieces[offsets[i][0]:offsets[i][1] + 1]
        corresponds to the original i-th token.

        This function inserts special tokens.
        """
        tokens_a, offsets_a = self._intra_word_tokenize(string_tokens_a)
        tokens_b, offsets_b = self._intra_word_tokenize(string_tokens_b)
        offsets_b = self._increment_offsets(
            offsets_b,
            (
                len(self.sequence_pair_start_tokens)
                + len(tokens_a)
                + len(self.sequence_pair_mid_tokens)
            ),
        )
        tokens_a = self.add_special_tokens(tokens_a, tokens_b)
        offsets_a = self._increment_offsets(offsets_a, len(self.sequence_pair_start_tokens))

        return tokens_a, offsets_a, offsets_b 
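As a hedged illustration of the offset convention described in the docstring above (the wordpiece segmentation is made up, and the shift for special tokens is ignored for brevity): wordpieces[offsets[i][0]:offsets[i][1] + 1] recovers the pieces of the i-th original token.

original_tokens = ["I", "was", "unhappily", "surprised"]
wordpieces = ["I", "was", "un", "##happily", "surprised"]    # hypothetical segmentation
offsets = [(0, 0), (1, 1), (2, 3), (4, 4)]

recovered = ["".join(wordpieces[start:end + 1]).replace("##", "")
             for start, end in offsets]
assert recovered == original_tokens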
Example 9
def add_special_tokens(
        self, tokens1: List[Token], tokens2: Optional[List[Token]] = None
    ) -> List[Token]:
        # Make sure we don't change the input parameters
        tokens1 = copy.deepcopy(tokens1)
        tokens2 = copy.deepcopy(tokens2)

        # We add special tokens and also set token type ids.
        if tokens2 is None:
            for token in tokens1:
                token.type_id = self.single_sequence_token_type_id
            return self.single_sequence_start_tokens + tokens1 + self.single_sequence_end_tokens
        else:
            for token in tokens1:
                token.type_id = self.sequence_pair_first_token_type_id
            for token in tokens2:
                token.type_id = self.sequence_pair_second_token_type_id
            return (
                self.sequence_pair_start_tokens
                + tokens1
                + self.sequence_pair_mid_tokens
                + tokens2
                + self.sequence_pair_end_tokens
            ) 
Example 10
def add_special_tokens(
        self, tokens1: List[Token], tokens2: Optional[List[Token]] = None
    ) -> List[Token]:
        """
        Adds special tokens to tokenized text. These are tokens like [CLS] or [SEP].

        Not all tokenizers do this. The default is to just return the tokens unchanged.

        # Parameters

        tokens1 : `List[Token]`
            The list of tokens to add special tokens to.
        tokens2 : `Optional[List[Token]]`
            An optional second list of tokens. This will be concatenated with `tokens1`. Special tokens will be
            added as appropriate.

        # Returns
        tokens : `List[Token]`
            The combined list of tokens, with special tokens added.
        """
        return tokens1 + (tokens2 or []) 
Example 11
def get_padding_lengths(self) -> Dict[str, int]:
        """
        The `TextField` has a list of `Tokens`, and each `Token` gets converted into arrays by
        (potentially) several `TokenIndexers`.  This method gets the max length (over tokens)
        associated with each of these arrays.
        """
        if self._indexed_tokens is None:
            raise ConfigurationError(
                "You must call .index(vocabulary) on a field before determining padding lengths."
            )

        padding_lengths = {}
        for indexer_name, indexer in self._token_indexers.items():
            indexer_lengths = indexer.get_padding_lengths(self._indexed_tokens[indexer_name])
            for key, length in indexer_lengths.items():
                padding_lengths[f"{indexer_name}___{key}"] = length
        return padding_lengths 
Example 12
def tokenize(self, text: str) -> List[Token]:
        if self._lowercase_characters:
            text = text.lower()
        if self._byte_encoding is not None:
            # We add 1 here so that we can still use 0 for masking, no matter what bytes we get out
            # of this.
            tokens = [Token(text_id=c + 1) for c in text.encode(self._byte_encoding)]
        else:
            tokens = [Token(t) for t in list(text)]
        for start_token in self._start_tokens:
            if isinstance(start_token, int):
                token = Token(text_id=start_token, idx=0)
            else:
                token = Token(text=start_token, idx=0)
            tokens.insert(0, token)
        for end_token in self._end_tokens:
            if isinstance(end_token, int):
                token = Token(text_id=end_token, idx=0)
            else:
                token = Token(text=end_token, idx=0)
            tokens.append(token)
        return tokens 
Example 13
def text_to_instance(self,  # type: ignore
                         sentence_tokens: List[str],
                         verb_vector: List[int],
                         entity_vector: List[int],
                         state_change_types: Optional[List[str]] = None,
                         state_change_tags: Optional[List[str]] = None) -> Instance:
        # pylint: disable=arguments-differ
        fields: Dict[str, Field] = {}

        # encode inputs
        token_field = TextField([Token(word) for word in sentence_tokens], self._token_indexers)
        fields['tokens'] = token_field
        fields['verb_span'] = SequenceLabelField(verb_vector, token_field, 'indicator_tags')
        fields['entity_span'] = SequenceLabelField(entity_vector, token_field, 'indicator_tags')

        # encode outputs
        if state_change_types:
            fields['state_change_type_labels'] = LabelField(state_change_types, 'state_change_type_labels')
        if state_change_tags:
            fields['state_change_tags'] = SequenceLabelField(state_change_tags, token_field, 'state_change_tags')

        return Instance(fields) 
Example 14
def test_find_span(self):
        sentence = [Token("My"), Token("car"), Token("is"), Token("-"), Token("grey"), Token("?")]

        # Single token
        assert _find_span([Token("car")], sentence) == (1, 1)

        # Multi token
        assert _find_span([Token("My"), Token("car")], sentence) == (0, 1)

        # Case insensitive
        assert _find_span([Token("my"), Token("car")], sentence) == (0, 1)

        # Not in sentence
        assert _find_span([Token("my"), Token("truck")], sentence) == (-1, -1)

        # Unknown
        assert _find_span([Token("?")], sentence) == (-2, -2)

        # Absent
        assert _find_span([Token("-")], sentence) == (-3, -3) 
Example 15
def _number_token_match(self,
                            entity: str,
                            entity_text: List[Token],
                            token: Token,
                            token_index: int,
                            tokens: List[Token]) -> float:
        # PNP had a "spanFeatures" function that said whether an entity was a-priori known to link
        # to a token or set of tokens in the question.  This was only used for numbers, and it's
        # not totally clear to me how this number feature overlapped with the token match features
        # in the original implementation (I think in most cases it was the same, except for things
        # like "four million", because the token match is derived from the entity name, which would
        # be 4000000, and wouldn't match "four million").
        #
        # Our implementation basically just adds a duplicate token match feature that's specific to
        # numbers.  It'll break in some rare cases (e.g., "Which four had four million ..."), but
        # those shouldn't be a big deal.
        if entity.startswith('fb:'):
            # This check works because numbers are the only entities that don't start with "fb:".
            return 0.0
        return self._contains_exact_token_match(entity, entity_text, token, token_index, tokens) 
Example 16
def _span_overlap_fraction(self,
                               entity: str,
                               entity_text: List[Token],
                               token: Token,
                               token_index: int,
                               tokens: List[Token]) -> float:
        entity_words = set(entity_token.text for entity_token in entity_text)
        if not entity_words:
            # Some tables have empty cells.
            return 0
        seen_entity_words = set()
        token_index_left = token_index
        while token_index < len(tokens) and tokens[token_index].text in entity_words:
            seen_entity_words.add(tokens[token_index].text)
            token_index += 1
        while token_index_left >= 0 and tokens[token_index_left].text in entity_words:
            seen_entity_words.add(tokens[token_index_left].text)
            token_index_left -= 1
        return len(seen_entity_words) / len(entity_words) 
Example 17
def _span_lemma_overlap_fraction(self,
                                     entity: str,
                                     entity_text: List[Token],
                                     token: Token,
                                     token_index: int,
                                     tokens: List[Token]) -> float:
        entity_lemmas = set(entity_token.lemma_ for entity_token in entity_text)
        if not entity_lemmas:
            # Some tables have empty cells.
            return 0
        seen_entity_lemmas = set()
        token_index_left = token_index
        while token_index < len(tokens) and tokens[token_index].lemma_ in entity_lemmas:
            seen_entity_lemmas.add(tokens[token_index].lemma_)
            token_index += 1
        while token_index_left >= 0 and tokens[token_index_left].lemma_ in entity_lemmas:
            seen_entity_lemmas.add(tokens[token_index_left].lemma_)
            token_index_left -= 1
        return len(seen_entity_lemmas) / len(entity_lemmas)

    # pylint: enable=unused-argument,no-self-use 
Example 18
def text_to_instance(self,  # type: ignore
                         sentence: str,
                         head: str,
                         tail: str,
                         head_type: str = None,
                         tail_type: str = None,
                         label: str = None) -> Instance:
        # pylint: disable=arguments-differ
        fields: Dict[str, Field] = {}
        
        instance_id = f'{head}#{tail}'
        if label:
            instance_id = f'{instance_id}#{label}'

        fields['metadata'] = MetadataField({'instance_id': instance_id.lower()})

        tokens = self._token_splitter.split_words(sentence)
        head = self._token_splitter.split_words(head)
        tail = self._token_splitter.split_words(tail)

        # TODO: this should not be done here

        if self._masking_mode == 'ner_least_specific':
            logger.info(f"Using masking mode 'ner_least_specific'.")
            tokens = ([Token('__start__')]
                      + head + [Token('__del1__')] + head_type + [Token('__ent1__')]
                      + tail + [Token('__del2__')] + tail_type + [Token('__ent2__')]
                      + tokens + [Token('__clf__')])
        else:
            tokens = [Token('__start__')] + head + [Token('__del1__')] + tail + [Token('__del2__')] + tokens + [Token('__clf__')]

        fields['sentence'] = TextField(tokens, self._token_indexers)
        
        if label:
            fields['label'] = LabelField(label)

        return Instance(fields) 
Example 19
def count_vocab_items(self, token: Token, counter: Dict[str, Dict[str, int]]):
        # If we only use pretrained models, we don't need to do anything here.
        pass 
Example 20
def tokens_to_indices(self,
                          tokens: List[Token],
                          vocabulary: Vocabulary,
                          index_name: str) -> Dict[str, List[int]]:
        if not self._added_to_vocabulary:
            self._add_encoding_to_vocabulary(vocabulary)
            self._added_to_vocabulary = True

        text_tokens = []
        offsets = []
        offset = -1

        for token in tokens:
            bpe_tokens = [self.encoder.get(t, 0) for t in self.byte_pair_encode(token) if self.encoder.get(t, 0) != 0]

            if bpe_tokens:
                offset += len(bpe_tokens)
                offsets.append(offset)
                text_tokens.extend(bpe_tokens)

        num_tokens = len(text_tokens)

        # If there are too many tokens, that's going to cause problems.
        if num_tokens >= self.n_ctx:
            print('Sequence too long. Pruning!')
            text_tokens = text_tokens[:self.n_ctx]
            text_tokens[-2] = self.encoder['__clf__</w>']
            text_tokens[-1] = 0
        else:
            text_tokens.append(0)

        return {
                index_name: text_tokens,
                f"{index_name}-offsets": offsets,
                # add mask here according to the original tokens,
                # because calling util.get_text_field_mask on the
                # "byte pair" tokens will produce the wrong shape
                "mask": [1 for _ in offsets]
        } 
Example 21
def __init__(self, tokens: List[Token], embs: numpy.ndarray, padding_value: int = 0,
            token_indexers=None) -> None:
        self.tokens = tokens
        self.embs = embs
        self.padding_value = padding_value

        if len(self.tokens) != self.embs.shape[0]:
            raise ValueError("The tokens you passed into the BERTField, {} "
                             "aren't the same size as the embeddings of shape {}".format(self.tokens, self.embs.shape))
        assert len(self.tokens) == self.embs.shape[0] 
Example 22
def count_vocab_items(self, token: Token, counter: Dict[str, Dict[str, int]]):
        # If we only use pretrained models, we don't need to do anything here.
        pass 
Example 23
def count_vocab_items(self, token: Token, counter: Dict[str, Dict[str, int]]):
        pass 
Example 24
def tokens_to_indices(
        self, tokens: List[Token], vocabulary: Vocabulary
    ) -> Dict[str, List[List[int]]]:
        # TODO(brendanr): Retain the token to index mappings in the vocabulary and remove this

        # https://github.com/allenai/allennlp/blob/master/allennlp/data/token_indexers/wordpiece_indexer.py#L113

        texts = [token.text for token in tokens]

        if any(text is None for text in texts):
            raise ConfigurationError(
                "ELMoTokenCharactersIndexer needs a tokenizer that retains text"
            )
        return {"elmo_tokens": [self._mapper.convert_word_to_char_ids(text) for text in texts]} 
Example 25
def count_vocab_items(self, token: Token, counter: Dict[str, Dict[str, int]]):
        return self._matched_indexer.count_vocab_items(token, counter) 
Example 26
def count_vocab_items(self, token: Token, counter: Dict[str, Dict[str, int]]):
        # We are using spacy to generate embeddings directly for our model,
        # so we don't need to capture the vocab - it is defined by the spacy
        # model we are using instead.
        pass 
Example 27
def tokens_to_indices(self, tokens: List[Token], vocabulary: Vocabulary) -> IndexedTokenList:
        self._add_encoding_to_vocabulary_if_needed(vocabulary)

        indices, type_ids = self._extract_token_and_type_ids(tokens)
        # The mask has 1 for real tokens and 0 for padding tokens. Only real tokens are attended to.
        output: IndexedTokenList = {
            "token_ids": indices,
            "mask": [True] * len(indices),
            "type_ids": type_ids,
        }

        return self._postprocess_output(output) 
Example 28
def indices_to_tokens(
        self, indexed_tokens: IndexedTokenList, vocabulary: Vocabulary
    ) -> List[Token]:
        token_ids = indexed_tokens["token_ids"]
        type_ids = indexed_tokens.get("type_ids")

        return [
            Token(
                text=vocabulary.get_token_from_index(token_ids[i], self._namespace),
                text_id=token_ids[i],
                type_id=type_ids[i] if type_ids is not None else None,
            )
            for i in range(len(token_ids))
        ] 
Example 29
def _extract_token_and_type_ids(
        self, tokens: List[Token]
    ) -> Tuple[List[int], Optional[List[int]]]:
        """
        Roughly equivalent to `zip(*[(token.text_id, token.type_id) for token in tokens])`,
        with some checks.
        """
        indices: List[int] = []
        type_ids: List[int] = []
        for token in tokens:
            if getattr(token, "text_id", None) is not None:
                # `text_id` being set on the token means that we aren't using the vocab, we just use
                # this id instead. Id comes from the pretrained vocab.
                # It is computed in PretrainedTransformerTokenizer.
                indices.append(token.text_id)
            else:
                raise KeyError(
                    "Using PretrainedTransformerIndexer but field text_id is not set"
                    f" for the following token: {token.text}"
                )

            if type_ids is not None and getattr(token, "type_id", None) is not None:
                type_ids.append(token.type_id)
            else:
                type_ids.append(0)

        return indices, type_ids 
Example 30
def count_vocab_items(self, token: Token, counter: Dict[str, Dict[str, int]]):
        if self.namespace is not None:
            text = self._get_feature_value(token)
            if self.lowercase_tokens:
                text = text.lower()
            counter[self.namespace][text] += 1