Python source code examples: allennlp.data.vocabulary.Vocabulary()
Example 1
def __init__(self,
             vocabulary: Vocabulary,
             namespace: str = "intent_labels",
             ignore_classes: List[str] = None,
             coarse: bool = True) -> None:
    """
    Parameters
    ----------
    vocabulary : ``Vocabulary``, required.
        A vocabulary containing the label namespace.
    namespace : str, optional (default = "intent_labels").
        The vocabulary namespace for labels.
    ignore_classes : List[str], optional.
        Labels which will be ignored when computing metrics.
    """
    self._label_vocabulary = vocabulary.get_index_to_token_vocabulary(namespace)
    self._ignore_classes: List[str] = ignore_classes or []
    self._coarse = coarse
    # These will hold per-label span counts.
    self._true_positives: Dict[str, int] = defaultdict(int)
    self._false_positives: Dict[str, int] = defaultdict(int)
    self._false_negatives: Dict[str, int] = defaultdict(int)
Example 2
def __init__(self,
             embedder: TextFieldEmbedder,
             vocab: Vocabulary,
             lm_head: LanguageModelHead = None,
             clf_head: ClassificationHead = None,
             language_model_weight: float = 0.5) -> None:
    super().__init__(vocab)
    assert not (lm_head is None and clf_head is None)
    self.embedder = embedder
    self.clf_head = clf_head
    self.lm_head = lm_head
    self.language_model_weight = language_model_weight
    self.vocab = vocab
Example 3
def tokens_to_indices(self, tokens: List[Token], vocabulary: Vocabulary) -> IndexedTokenList:
    self._matched_indexer._add_encoding_to_vocabulary_if_needed(vocabulary)
    wordpieces, offsets = self._allennlp_tokenizer.intra_word_tokenize([t.text for t in tokens])
    # For tokens that don't correspond to any word pieces, we put (-1, -1) into the offsets.
    # That results in the embedding for the token being all zeros.
    offsets = [x if x is not None else (-1, -1) for x in offsets]
    output: IndexedTokenList = {
        "token_ids": [t.text_id for t in wordpieces],
        "mask": [True] * len(tokens),  # for original tokens (i.e. word-level)
        "type_ids": [t.type_id for t in wordpieces],
        "offsets": offsets,
        "wordpiece_mask": [True] * len(wordpieces),  # for wordpieces (i.e. subword-level)
    }
    return self._matched_indexer._postprocess_output(output)
Example 4
def _add_encoding_to_vocabulary_if_needed(self, vocab: Vocabulary) -> None:
    """
    Copies tokens from the ``transformers`` model's vocab to the specified namespace.
    """
    if self._added_to_vocabulary:
        return
    try:
        vocab_items = self._tokenizer.get_vocab().items()
    except NotImplementedError:
        vocab_items = (
            (self._tokenizer.convert_ids_to_tokens(idx), idx)
            for idx in range(self._tokenizer.vocab_size)
        )
    for word, idx in vocab_items:
        vocab._token_to_index[self._namespace][word] = idx
        vocab._index_to_token[self._namespace][idx] = word
    self._added_to_vocabulary = True
Example 5
def tokens_to_indices(
    self, tokens: List[Token], vocabulary: Vocabulary
) -> Dict[str, List[int]]:
    indices: List[int] = []
    for token in itertools.chain(self._start_tokens, tokens, self._end_tokens):
        text = self._get_feature_value(token)
        if self.namespace is None:
            # We could have a check here that `text` is an int; not sure it's worth it.
            indices.append(text)  # type: ignore
        else:
            if self.lowercase_tokens:
                text = text.lower()
            indices.append(vocabulary.get_token_index(text, self.namespace))
    return {"tokens": indices}
Example 6
def tokens_to_indices(
    self, tokens: List[Token], vocabulary: Vocabulary
) -> Dict[str, List[List[int]]]:
    indices: List[List[int]] = []
    for token in itertools.chain(self._start_tokens, tokens, self._end_tokens):
        token_indices: List[int] = []
        if token.text is None:
            raise ConfigurationError(
                "TokenCharactersIndexer needs a tokenizer that retains text"
            )
        for character in self._character_tokenizer.tokenize(token.text):
            if getattr(character, "text_id", None) is not None:
                # `text_id` being set on the token means that we aren't using the vocab;
                # we just use this id instead.
                index = character.text_id
            else:
                index = vocabulary.get_token_index(character.text, self._namespace)
            token_indices.append(index)
        indices.append(token_indices)
    return {"token_characters": indices}
Example 7
def count_vocab_items(self, counter: Dict[str, Dict[str, int]]):
    """
    If there are strings in this field that need to be converted into integers through a
    :class:`Vocabulary`, here is where we count them, to determine which tokens are in or out
    of the vocabulary.

    If your `Field` does not have any strings that need to be converted into indices, you do
    not need to implement this method.

    A note on this `counter`: because `Fields` can represent conceptually different things,
    we separate the vocabulary items by `namespaces`. This way, we can use a single shared
    mechanism to handle all mappings from strings to integers in all fields, while keeping
    words in a `TextField` from sharing the same ids with labels in a `LabelField` (e.g.,
    "entailment" or "contradiction" are labels in an entailment task).

    Additionally, a single `Field` might want to use multiple namespaces - `TextFields` can
    be represented as a combination of word ids and character ids, and you don't want words and
    characters to share the same vocabulary - "a" as a word should get a different id from "a"
    as a character, and the vocabulary sizes of words and characters are very different.

    Because of this, the first key in the `counter` object is a `namespace`, like "tokens",
    "token_characters", "tags", or "labels", and the second key is the actual vocabulary item.
    """
    pass
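As a concrete illustration of the namespace-then-item layout of `counter` described above, here is a minimal, hypothetical sketch of a label-style field counting its label into a "labels" namespace; the class and attribute names are illustrative and not taken from the examples on this page.

from collections import defaultdict
from typing import Dict


class ToyLabelField:
    """Illustrative only: a tiny Field-like class holding a single string label."""

    def __init__(self, label: str, label_namespace: str = "labels") -> None:
        self.label = label
        self._label_namespace = label_namespace

    def count_vocab_items(self, counter: Dict[str, Dict[str, int]]) -> None:
        # First key: namespace; second key: the vocabulary item itself.
        counter[self._label_namespace][self.label] += 1


counter: Dict[str, Dict[str, int]] = defaultdict(lambda: defaultdict(int))
for field in [ToyLabelField("entailment"), ToyLabelField("contradiction"), ToyLabelField("entailment")]:
    field.count_vocab_items(counter)
# counter now looks like {"labels": {"entailment": 2, "contradiction": 1}}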
Example 8
def test_multilabel_field_empty_field_works(self):
    vocab = Vocabulary()
    vocab.add_token_to_namespace("label1", namespace="test_empty_labels")
    vocab.add_token_to_namespace("label2", namespace="test_empty_labels")
    f = MultiLabelField([], label_namespace="test_empty_labels")
    f.index(vocab)
    tensor = f.as_tensor(f.get_padding_lengths()).detach().cpu().numpy()
    numpy.testing.assert_array_almost_equal(tensor, numpy.array([0, 0]))
    g = f.empty_field()
    g.index(vocab)
    tensor = g.as_tensor(g.get_padding_lengths()).detach().cpu().numpy()
    numpy.testing.assert_array_almost_equal(tensor, numpy.array([0, 0]))
    h = MultiLabelField(
        [0, 0, 1], label_namespace="test_empty_labels", num_labels=3, skip_indexing=True
    )
    tensor = h.empty_field().as_tensor(None).detach().cpu().numpy()
    numpy.testing.assert_array_almost_equal(tensor, numpy.array([0, 0, 0]))
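For contrast with the empty-field cases in this test, here is a short follow-on sketch (hypothetical, not part of the original test, reusing the same `vocab` and namespace) showing that a non-empty MultiLabelField indexes to a multi-hot vector over the label namespace:

# Hypothetical continuation of the test above.
f2 = MultiLabelField(["label2"], label_namespace="test_empty_labels")
f2.index(vocab)
tensor = f2.as_tensor(f2.get_padding_lengths()).detach().cpu().numpy()
# Expected: one slot per label in the namespace, 1 where the label is present,
# i.e. approximately numpy.array([0, 1]) if "label1" got index 0 and "label2" index 1.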
Example 9
def __init__(self,
             word_embeddings: TextFieldEmbedder,
             vocab: Vocabulary,
             lstm_hidden_dim: int,
             top_k: int,
             cuda_device: int) -> None:
    super().__init__(vocab)
    self.word_embeddings = word_embeddings
    self.query_rep = nn.LSTM(self.word_embeddings.get_output_dim(), lstm_hidden_dim, batch_first=True, bidirectional=True)
    self.doc_rep = nn.LSTM(self.word_embeddings.get_output_dim(), lstm_hidden_dim, batch_first=True, bidirectional=True)
    # this does not really do "attention" - just a plain cosine matrix calculation (without learnable weights)
    self.cosine_module = CosineMatrixAttention()
    self.top_k = top_k
    self.dense = nn.Linear(top_k, out_features=20, bias=True)
    self.dense2 = nn.Linear(20, out_features=20, bias=True)
    self.dense3 = nn.Linear(20, out_features=1, bias=False)
Example 10
def __init__(self,
             vocab: Vocabulary,
             word_embedder: TextFieldEmbedder,
             character_embedder: TextFieldEmbedder,
             encoder: Seq2SeqEncoder,
             character_encoder: Seq2VecEncoder) -> None:
    super().__init__(vocab)
    self._word_embedder = word_embedder
    self._character_embedder = character_embedder
    self._character_encoder = character_encoder
    self._encoder = encoder
    self._classifier = torch.nn.Linear(
        in_features=encoder.get_output_dim(),
        out_features=vocab.get_vocab_size('labels')
    )
    self._f1 = SpanBasedF1Measure(vocab, 'labels')
Example 11
def __init__(self,
             vocab: Vocabulary,
             embedder: TextFieldEmbedder,
             encoder: Seq2SeqEncoder) -> None:
    super().__init__(vocab)
    self._embedder = embedder
    self._encoder = encoder
    self._classifier = torch.nn.Linear(
        in_features=encoder.get_output_dim(),
        out_features=vocab.get_vocab_size('labels')
    )
    self._crf = ConditionalRandomField(
        vocab.get_vocab_size('labels')
    )
    self._f1 = SpanBasedF1Measure(vocab, 'labels')
Example 12
def _add_encoding_to_vocabulary(self, vocabulary: Vocabulary) -> None:
    # pylint: disable=protected-access
    for word, idx in self.encoder.items():
        vocabulary._token_to_index[self._namespace][word] = idx
        vocabulary._index_to_token[self._namespace][idx] = word
Example 13
def tokens_to_indices(self,
                      tokens: List[Token],
                      vocabulary: Vocabulary,
                      index_name: str) -> Dict[str, List[int]]:
    if not self._added_to_vocabulary:
        self._add_encoding_to_vocabulary(vocabulary)
        self._added_to_vocabulary = True
    text_tokens = []
    offsets = []
    offset = -1
    for token in tokens:
        bpe_tokens = [self.encoder.get(t, 0) for t in self.byte_pair_encode(token) if self.encoder.get(t, 0) != 0]
        if bpe_tokens:
            offset += len(bpe_tokens)
            offsets.append(offset)
            text_tokens.extend(bpe_tokens)
    num_tokens = len(text_tokens)
    # If there are too many tokens, that's going to cause problems.
    if num_tokens >= self.n_ctx:
        print('Sequence too long. Pruning!')
        text_tokens = text_tokens[:self.n_ctx]
        text_tokens[-2] = self.encoder['__clf__</w>']
        text_tokens[-1] = 0
    else:
        text_tokens.append(0)
    return {
        index_name: text_tokens,
        f"{index_name}-offsets": offsets,
        # add the mask here according to the original tokens,
        # because calling util.get_text_field_mask on the
        # "byte pair" tokens would produce the wrong shape
        "mask": [1 for _ in offsets]
    }
Example 14
def _add_encoding_to_vocabulary(self, vocabulary: Vocabulary) -> None:
    # pylint: disable=protected-access
    for word, idx in self.vocab.items():
        vocabulary._token_to_index[self._namespace][word] = idx
        vocabulary._index_to_token[self._namespace][idx] = word
Example 15
def tokens_to_indices(self, tokens: List[Token], vocabulary: Vocabulary) -> IndexedTokenList:
    self._add_encoding_to_vocabulary_if_needed(vocabulary)
    indices, type_ids = self._extract_token_and_type_ids(tokens)
    # The mask has 1 for real tokens and 0 for padding tokens. Only real tokens are attended to.
    output: IndexedTokenList = {
        "token_ids": indices,
        "mask": [True] * len(indices),
        "type_ids": type_ids,
    }
    return self._postprocess_output(output)
Example 16
def indices_to_tokens(
    self, indexed_tokens: IndexedTokenList, vocabulary: Vocabulary
) -> List[Token]:
    token_ids = indexed_tokens["token_ids"]
    type_ids = indexed_tokens.get("type_ids")
    return [
        Token(
            text=vocabulary.get_token_from_index(token_ids[i], self._namespace),
            text_id=token_ids[i],
            type_id=type_ids[i] if type_ids is not None else None,
        )
        for i in range(len(token_ids))
    ]
Example 17
def tokens_to_indices(self, tokens: List[Token], vocabulary: Vocabulary) -> IndexedTokenList:
    """
    Takes a list of tokens and converts them to an `IndexedTokenList`.
    This could be just an ID for each token from the vocabulary.
    Or it could split each token into characters and return one ID per character.
    Or (for instance, in the case of byte-pair encoding) there might not be a clean
    mapping from individual tokens to indices, and the `IndexedTokenList` could be a complex
    data structure.
    """
    raise NotImplementedError
Example 18
def indices_to_tokens(
    self, indexed_tokens: IndexedTokenList, vocabulary: Vocabulary
) -> List[Token]:
    """
    Inverse operation of tokens_to_indices. Takes an `IndexedTokenList` and converts it back
    into a list of tokens.
    """
    raise NotImplementedError
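Examples 17 and 18 define the abstract interface only. As a rough illustration of what the pair is expected to accomplish, here is a minimal round-trip sketch using a simple one-id-per-token scheme; it relies only on Vocabulary methods that appear elsewhere on this page, and the token texts are made up:

from allennlp.data import Vocabulary
from allennlp.data.tokenizers import Token

vocab = Vocabulary()
vocab.add_token_to_namespace("hello", namespace="tokens")
vocab.add_token_to_namespace("world", namespace="tokens")

# tokens -> indices (what a simple tokens_to_indices might return)
tokens = [Token("hello"), Token("world")]
indexed = {"token_ids": [vocab.get_token_index(t.text, "tokens") for t in tokens]}

# indices -> tokens (what the matching indices_to_tokens would reconstruct)
recovered = [Token(vocab.get_token_from_index(i, "tokens")) for i in indexed["token_ids"]]
print([t.text for t in recovered])  # ['hello', 'world']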
Example 19
def index_instances(self, vocab: Vocabulary) -> None:
    for instance in self.instances:
        instance.index_fields(vocab)
Example 20
def __init__(self, instances: List[Instance], vocab: Vocabulary = None):
    self.instances = instances
    self.vocab = vocab
Example 21
def __init__(
    self,
    instance_generator: Callable[[str], Iterable[Instance]],
    file_path: str,
    vocab: Vocabulary = None,
) -> None:
    super().__init__()
    self._instance_generator = instance_generator
    self._file_path = file_path
    self.vocab = vocab
Example 22
def index_with(self, vocab: Vocabulary):
    self.vocab = vocab
Example 23
def add_field(self, field_name: str, field: Field, vocab: Vocabulary = None) -> None:
    """
    Add the field to the existing fields mapping.
    If we have already indexed the Instance, then we also index `field`, so
    it is necessary to supply the vocab.
    """
    self.fields[field_name] = field
    if self.indexed:
        field.index(vocab)
Example 24
def index_fields(self, vocab: Vocabulary) -> None:
    """
    Indexes all fields in this `Instance` using the provided `Vocabulary`.
    This `mutates` the current object; it does not return a new `Instance`.

    A `DataLoader` will call this on each pass through a dataset; we use the `indexed`
    flag to make sure that indexing only happens once.

    This means that if for some reason you modify your vocabulary after you've
    indexed your instances, you might get unexpected behavior.
    """
    if not self.indexed:
        self.indexed = True
        for field in self.fields.values():
            field.index(vocab)
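To tie several of these snippets together, here is a small usage sketch, assuming allennlp's standard data API (Instance, TextField, LabelField, SingleIdTokenIndexer): build an instance, construct a Vocabulary from it, then index its fields.

from allennlp.data import Instance, Vocabulary
from allennlp.data.fields import LabelField, TextField
from allennlp.data.token_indexers import SingleIdTokenIndexer
from allennlp.data.tokenizers import Token

tokens = [Token("the"), Token("cat"), Token("sat")]
instance = Instance({
    "tokens": TextField(tokens, {"tokens": SingleIdTokenIndexer(namespace="tokens")}),
    "label": LabelField("animal", label_namespace="labels"),
})

vocab = Vocabulary.from_instances([instance])  # counts vocab items per namespace
instance.index_fields(vocab)                   # mutates the instance; indexing runs only once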
Example 25
def index(self, vocab: Vocabulary):
    self._mapping_array = [
        vocab.get_token_index(x.text, self._target_namespace) for x in self._source_tokens
    ]
Example 26
def index(self, vocab: Vocabulary):
    for field in self.field_list:
        field.index(vocab)
Example 27
def _maybe_warn_for_namespace(self, label_namespace: str) -> None:
    if not (self._label_namespace.endswith("labels") or self._label_namespace.endswith("tags")):
        if label_namespace not in self._already_warned_namespaces:
            logger.warning(
                "Your label namespace was '%s'. We recommend you use a namespace "
                "ending with 'labels' or 'tags', so we don't add UNK and PAD tokens by "
                "default to your vocabulary. See documentation for "
                "`non_padded_namespaces` parameter in Vocabulary.",
                self._label_namespace,
            )
            self._already_warned_namespaces.add(label_namespace)
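The warning refers to Vocabulary's `non_padded_namespaces` parameter. As a short sketch, assuming allennlp's default behavior (namespaces matching "*labels" or "*tags" get no padding/OOV entries, all other namespaces do), the difference looks like this:

from allennlp.data import Vocabulary

vocab = Vocabulary()
vocab.add_token_to_namespace("PER", namespace="ner_tags")  # matches "*tags": non-padded
vocab.add_token_to_namespace("cat", namespace="tokens")    # padded namespace

print(vocab.get_vocab_size("ner_tags"))  # 1: just "PER"
print(vocab.get_vocab_size("tokens"))    # 3: padding token, OOV token, and "cat"

# The patterns can be overridden via the constructor, e.g.
# Vocabulary(non_padded_namespaces=["my_namespace"]), which is what the warning points at.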
Example 28
def index(self, vocab: Vocabulary):
    if not self._skip_indexing:
        self._label_id = vocab.get_token_index(
            self.label, self._label_namespace  # type: ignore
        )
Example 29
def _maybe_warn_for_namespace(self, label_namespace: str) -> None:
    if not (self._label_namespace.endswith("labels") or self._label_namespace.endswith("tags")):
        if label_namespace not in self._already_warned_namespaces:
            logger.warning(
                "Your label namespace was '%s'. We recommend you use a namespace "
                "ending with 'labels' or 'tags', so we don't add UNK and PAD tokens by "
                "default to your vocabulary. See documentation for "
                "`non_padded_namespaces` parameter in Vocabulary.",
                self._label_namespace,
            )
            self._already_warned_namespaces.add(label_namespace)
Example 30
def index(self, vocab: Vocabulary):
    if self.labels is not None:
        self._indexed_labels = [
            vocab.get_token_index(label, self._label_namespace) for label in self.labels
        ]