Python source code examples: torchtext.data
Example 1
def get_fields(data_type, n_src_features, n_tgt_features):
    """
    Args:
        data_type: type of the source input.
            Options are [text|img|video|audio].
        n_src_features: the number of source features to
            create `torchtext.data.Field` for.
        n_tgt_features: the number of target features to
            create `torchtext.data.Field` for.
    Returns:
        A dictionary whose keys are strings and whose values are the
        corresponding Field objects.
    """
    if data_type == 'text':
        return TextDataset.get_fields(n_src_features, n_tgt_features)
    elif data_type == 'img':
        return ImageDataset.get_fields(n_src_features, n_tgt_features)
    elif data_type == 'video':
        return VideoDataset.get_fields(n_src_features, n_tgt_features)
    elif data_type == 'audio':
        return AudioDataset.get_fields(n_src_features, n_tgt_features)
Example 2
def _merge_field_vocabs(knl_field, src_field, tgt_field, vocab_size, min_freq):
    # In the long run, shouldn't it be possible to do this by calling
    # build_vocab with both the src and tgt data?
    specials = [tgt_field.unk_token, tgt_field.pad_token,
                tgt_field.init_token, tgt_field.eos_token]
    merged = sum(
        [knl_field.vocab.freqs, src_field.vocab.freqs, tgt_field.vocab.freqs],
        Counter()
    )
    merged_vocab = Vocab(
        merged, specials=specials,
        max_size=vocab_size, min_freq=min_freq
    )
    knl_field.vocab = merged_vocab
    src_field.vocab = merged_vocab
    tgt_field.vocab = merged_vocab
    assert len(src_field.vocab) == len(tgt_field.vocab) == len(knl_field.vocab)
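The sum(..., Counter()) idiom above merges the per-field frequency tables into a single Counter before building the shared Vocab. A minimal standalone illustration of just that idiom (toy counts are assumptions):

from collections import Counter

src_freqs = Counter({"the": 3, "cat": 1})
tgt_freqs = Counter({"the": 2, "dog": 4})
merged = sum([src_freqs, tgt_freqs], Counter())
print(merged)  # Counter({'the': 5, 'dog': 4, 'cat': 1})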
Example 3
def create_batches(self):
    """ Create batches """
    if self.train:
        def _pool(data, random_shuffler):
            for p in torchtext.data.batch(data, self.batch_size * 100):
                p_batch = torchtext.data.batch(
                    sorted(p, key=self.sort_key),
                    self.batch_size, self.batch_size_fn)
                for b in random_shuffler(list(p_batch)):
                    yield b
        self.batches = _pool(self.data(), self.random_shuffler)
    else:
        self.batches = []
        for b in torchtext.data.batch(self.data(), self.batch_size,
                                      self.batch_size_fn):
            self.batches.append(sorted(b, key=self.sort_key))
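A hedged, self-contained sketch of the same bucketing idea in plain Python (the toy data, batch size, and length sort key are assumptions; chunk() is a hypothetical stand-in for torchtext.data.batch):

import random

def chunk(data, size):
    # Hypothetical stand-in for torchtext.data.batch: yield fixed-size chunks.
    buf = []
    for ex in data:
        buf.append(ex)
        if len(buf) == size:
            yield buf
            buf = []
    if buf:
        yield buf

data = [["w"] * random.randint(1, 30) for _ in range(1000)]
batch_size = 4
for pool in chunk(data, batch_size * 100):
    # Sort a large pool by length so each batch holds similar lengths,
    batches = list(chunk(sorted(pool, key=len), batch_size))
    # then shuffle the batches so the training order stays random.
    random.shuffle(batches)
    for b in batches:
        print([len(ex) for ex in b])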
Example 4
def load_dataloaders(args):
    logger.info("Preparing dataloaders...")
    FR = torchtext.data.Field(tokenize=dum_tokenizer, lower=True,
                              init_token="<sos>", eos_token="<eos>",
                              batch_first=True)
    EN = torchtext.data.Field(tokenize=dum_tokenizer, lower=True,
                              batch_first=True)
    train_path = os.path.join("./data/", "df.csv")
    if not os.path.isfile(train_path):
        tokenize_data(args)
    train = torchtext.data.TabularDataset(train_path, format="csv",
                                          fields=[("EN", EN), ("FR", FR)])
    FR.build_vocab(train)
    EN.build_vocab(train)
    train_iter = BucketIterator(train, batch_size=args.batch_size,
                                repeat=False,
                                sort_key=lambda x: (len(x["EN"]), len(x["FR"])),
                                shuffle=True, train=True)
    train_length = len(train)
    logger.info("Loaded dataloaders.")
    return train_iter, FR, EN, train_length
Example 5
def get_fields(data_type, n_src_features, n_tgt_features):
    """
    Args:
        data_type: type of the source input. Options are [text|img|audio].
        n_src_features: the number of source features to
            create `torchtext.data.Field` for.
        n_tgt_features: the number of target features to
            create `torchtext.data.Field` for.
    Returns:
        A dictionary whose keys are strings and whose values are the
        corresponding Field objects.
    """
    if data_type == 'text':
        return TextDataset.get_fields(n_src_features, n_tgt_features)
    elif data_type == 'img':
        return ImageDataset.get_fields(n_src_features, n_tgt_features)
    elif data_type == 'audio':
        return AudioDataset.get_fields(n_src_features, n_tgt_features)
Example 6
def _old_style_vocab(vocab):
    """Detect old-style vocabs (``List[Tuple[str, torchtext.vocab.Vocab]]``).
    Args:
        vocab: some object loaded from a *.vocab.pt file
    Returns:
        Whether ``vocab`` is a list of pairs where the second object
        is a :class:`torchtext.vocab.Vocab` object.
    This exists because previously only the vocab objects from the fields
    were saved directly, not the fields themselves, and the fields needed to
    be reconstructed at training and translation time.
    """
    return isinstance(vocab, list) and \
        any(isinstance(v[1], Vocab) for v in vocab)
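A hedged sanity check of the detector, assuming the legacy torchtext.vocab.Vocab constructor that accepts a Counter (the toy vocabs are assumptions):

from collections import Counter
from torchtext.vocab import Vocab

old_style = [("src", Vocab(Counter({"a": 2}))), ("tgt", Vocab(Counter({"b": 1})))]
new_style = {"src": None, "tgt": None}
print(_old_style_vocab(old_style))  # True
print(_old_style_vocab(new_style))  # False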
Example 7
def _merge_field_vocabs(src_field, tgt_field, vocab_size, min_freq,
                        vocab_size_multiple):
    # In the long run, shouldn't it be possible to do this by calling
    # build_vocab with both the src and tgt data?
    specials = [tgt_field.unk_token, tgt_field.pad_token,
                tgt_field.init_token, tgt_field.eos_token]
    merged = sum(
        [src_field.vocab.freqs, tgt_field.vocab.freqs], Counter()
    )
    merged_vocab = Vocab(
        merged, specials=specials,
        max_size=vocab_size, min_freq=min_freq
    )
    if vocab_size_multiple > 1:
        _pad_vocab_to_multiple(merged_vocab, vocab_size_multiple)
    src_field.vocab = merged_vocab
    tgt_field.vocab = merged_vocab
    assert len(src_field.vocab) == len(tgt_field.vocab)
Example 8
def create_batches(self):
    if self.train:
        def _pool(data, random_shuffler):
            for p in torchtext.data.batch(data, self.batch_size * 100):
                p_batch = batch_iter(
                    sorted(p, key=self.sort_key),
                    self.batch_size,
                    batch_size_fn=self.batch_size_fn,
                    batch_size_multiple=self.batch_size_multiple)
                for b in random_shuffler(list(p_batch)):
                    yield b
        self.batches = _pool(self.data(), self.random_shuffler)
    else:
        self.batches = []
        for b in batch_iter(
                self.data(),
                self.batch_size,
                batch_size_fn=self.batch_size_fn,
                batch_size_multiple=self.batch_size_multiple):
            self.batches.append(sorted(b, key=self.sort_key))
Example 9
def get_fields(data_type, n_src_features, n_tgt_features):
    """
    Args:
        data_type: type of the source input.
            Options are [text|img|audio|gcn].
        n_src_features: the number of source features to
            create `torchtext.data.Field` for.
        n_tgt_features: the number of target features to
            create `torchtext.data.Field` for.
    Returns:
        A dictionary whose keys are strings and whose values are the
        corresponding Field objects.
    """
    if data_type == 'text':
        return TextDataset.get_fields(n_src_features, n_tgt_features)
    elif data_type == 'img':
        return ImageDataset.get_fields(n_src_features, n_tgt_features)
    elif data_type == 'audio':
        return AudioDataset.get_fields(n_src_features, n_tgt_features)
    elif data_type == 'gcn':
        return GCNDataset.get_fields(n_src_features, n_tgt_features)
Example 10
def get_morph(batch):
    # Not very nice, but we do not have access to the value coming from the
    # opt.gpuid command-line parameter here.
    use_cuda = batch.src[0].is_cuda
    # morph_index = batch.morph.data.transpose(0, 1)  # [seqLen x batch_size] ==> [batch_size x seqLen]
    # morph_voc = batch.dataset.fields['morph'].vocab.stoi
    morph_index = batch.morph.view((batch.src[0].data.size()[0], 6,
                                    batch.src[0].data.size()[1]))
    morph_index = morph_index.permute(2, 0, 1).contiguous()
    # morph_index = torch.LongTensor(morph_index)
    # Mask out entries equal to 1 (assumed padding index).
    morph_mask = torch.lt(torch.eq(morph_index, 1), 1).float()
    # morph_index = autograd.Variable(morph_index)
    # morph_mask = autograd.Variable(torch.FloatTensor(morph_mask), requires_grad=False)
    if use_cuda:
        morph_index = morph_index.cuda()
        morph_mask = morph_mask.cuda()
    return morph_index, morph_mask
Example 11
def make_features(batch, side):
    """
    Args:
        batch (Variable): a batch of source or target data.
        side (str): for source or for target.
    Returns:
        A sequence of src/tgt tensors with optional feature tensors
        of size (len x batch).
    """
    assert side in ['src', 'tgt']
    if isinstance(batch.__dict__[side], tuple):
        data = batch.__dict__[side][0]
    else:
        data = batch.__dict__[side]
    feat_start = side + "_feat_"
    # Sort the feature *keys*, then gather the tensors (sorting the tensors
    # themselves would fail for multi-element tensors).
    keys = sorted(k for k in batch.__dict__ if feat_start in k)
    features = [batch.__dict__[k] for k in keys]
    levels = [data] + features
    return torch.cat([level.unsqueeze(2) for level in levels], 2)
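A hedged usage sketch: make_features reads fields from batch.__dict__, so a simple namespace can stand in for a real torchtext batch here (the tensor shapes are assumptions):

import torch
from types import SimpleNamespace

# Fake "batch" with a (len x batch) src tensor and one word-feature tensor.
batch = SimpleNamespace(src=torch.zeros(5, 3).long(),
                        src_feat_0=torch.ones(5, 3).long())
out = make_features(batch, 'src')
print(out.shape)  # torch.Size([5, 3, 2]): words stacked with their feature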
Example 12
def collapse_copy_scores(self, scores, batch, tgt_vocab):
    """Given scores from an expanded dictionary
    corresponding to a batch, sums together copies,
    with a dictionary word when it is ambiguous.
    """
    offset = len(tgt_vocab)
    for b in range(batch.batch_size):
        index = batch.indices.data[b]
        src_vocab = self.src_vocabs[index]
        for i in range(1, len(src_vocab)):
            sw = src_vocab.itos[i]
            ti = tgt_vocab.stoi[sw]
            if ti != 0:
                scores[:, b, ti] += scores[:, b, offset + i]
                scores[:, b, offset + i].fill_(1e-20)
    return scores
Example 13
def predict(test_mode, dataset_iter):
    model.eval()
    dataset_iter.init_epoch()
    qids = []
    predictions = []
    labels = []
    for dev_batch_idx, dev_batch in enumerate(dataset_iter):
        qid_array = np.transpose(dev_batch.id.cpu().data.numpy())
        true_label_array = np.transpose(dev_batch.label.cpu().data.numpy())
        output = model.convModel(dev_batch)
        scores = model.linearLayer(output)
        score_array = scores.cpu().data.numpy().reshape(-1)
        qids.extend(qid_array.tolist())
        predictions.extend(score_array.tolist())
        labels.extend(true_label_array.tolist())
    dev_map, dev_mrr = get_map_mrr(qids, predictions, labels)
    logger.info("{} {}".format(dev_map, dev_mrr))

# Run the model on the dev set
Example 14
def create_batches(self):
    if self.train:
        if self.yield_raw_example:
            self.batches = batch_iter(
                self.data(),
                1,
                batch_size_fn=None,
                batch_size_multiple=1)
        else:
            self.batches = _pool(
                self.data(),
                self.batch_size,
                self.batch_size_fn,
                self.batch_size_multiple,
                self.sort_key,
                self.random_shuffler,
                self.pool_factor)
    else:
        self.batches = []
        for b in batch_iter(
                self.data(),
                self.batch_size,
                batch_size_fn=self.batch_size_fn,
                batch_size_multiple=self.batch_size_multiple):
            self.batches.append(sorted(b, key=self.sort_key))
Example 15
def make_features(batch, side, data_type='text'):
    """
    Args:
        batch (Variable): a batch of source or target data.
        side (str): for source or for target.
        data_type (str): type of the source input. Options are [text|img].
    Returns:
        A sequence of src/tgt tensors with optional feature tensors
        of size (len x batch).
    """
    assert side in ['src', 'tgt']
    if isinstance(batch.__dict__[side], tuple):
        data = batch.__dict__[side][0]
    else:
        data = batch.__dict__[side]
    feat_start = side + "_feat_"
    keys = sorted([k for k in batch.__dict__ if feat_start in k])
    features = [batch.__dict__[k] for k in keys]
    levels = [data] + features
    if data_type == 'text':
        return torch.cat([level.unsqueeze(2) for level in levels], 2)
    else:
        return levels[0]
Example 16
def coalesce_datasets(datasets):
    """Coalesce all dataset instances."""
    final = datasets[0]
    for d in datasets[1:]:
        # `src_vocabs` is a list of `torchtext.vocab.Vocab`.
        # Each sentence is turned into one Vocab.
        # Coalesce them into one big list.
        final.src_vocabs += d.src_vocabs
        # All datasets have the same number of features.
        aeq(final.n_src_feats, d.n_src_feats)
        aeq(final.n_tgt_feats, d.n_tgt_feats)
        # `examples` is a list of `torchtext.data.Example`.
        # Coalesce them into one big list.
        final.examples += d.examples
    # All datasets have the same fields, no need to update.
    return final
Example 17
def make_features(batch, side, data_type='text'):
    """
    Args:
        batch (Variable): a batch of source or target data.
        side (str): for source or for target.
        data_type (str): type of the source input.
            Options are [text|img|audio].
    Returns:
        A sequence of src/tgt tensors with optional feature tensors
        of size (len x batch).
    """
    assert side in ['src', 'tgt']
    if isinstance(batch.__dict__[side], tuple):
        data = batch.__dict__[side][0]
    else:
        data = batch.__dict__[side]
    feat_start = side + "_feat_"
    keys = sorted([k for k in batch.__dict__ if feat_start in k])
    features = [batch.__dict__[k] for k in keys]
    levels = [data] + features
    if data_type == 'text':
        return torch.cat([level.unsqueeze(2) for level in levels], 2)
    else:
        return levels[0]
Example 18
def create_batches(self):
    if self.train:
        def pool(data, random_shuffler):
            for p in torchtext.data.batch(data, self.batch_size * 100):
                p_batch = torchtext.data.batch(
                    sorted(p, key=self.sort_key),
                    self.batch_size, self.batch_size_fn)
                for b in random_shuffler(list(p_batch)):
                    yield b
        self.batches = pool(self.data(), self.random_shuffler)
    else:
        self.batches = []
        for b in torchtext.data.batch(self.data(), self.batch_size,
                                      self.batch_size_fn):
            self.batches.append(sorted(b, key=self.sort_key))
Example 19
def make_src(data, vocab):
    src_size = max([t.size(0) for t in data])
    src_vocab_size = max([t.max() for t in data]) + 1
    alignment = torch.zeros(src_size, len(data), src_vocab_size)
    for i, sent in enumerate(data):
        for j, t in enumerate(sent):
            alignment[j, i, t] = 1
    return alignment
Example 20
def make_tgt(data, vocab):
    tgt_size = max([t.size(0) for t in data])
    alignment = torch.zeros(tgt_size, len(data)).long()
    for i, sent in enumerate(data):
        alignment[:sent.size(0), i] = sent
    return alignment
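A hedged usage sketch for make_tgt above; make_src builds the analogous one-hot (src_len x batch x src_vocab_size) tensor. The toy index sequences are assumptions, and vocab is accepted but unused:

import torch

data = [torch.tensor([5, 7, 9]), torch.tensor([3, 4])]
print(make_tgt(data, vocab=None))
# tensor([[5, 3],
#         [7, 4],
#         [9, 0]])  <- columns are sentences, zero-padded to the longest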
Example 21
def make_img(data, vocab):
    c = data[0].size(0)
    h = max([t.size(1) for t in data])
    w = max([t.size(2) for t in data])
    imgs = torch.zeros(len(data), c, h, w).fill_(1)
    for i, img in enumerate(data):
        imgs[i, :, 0:img.size(1), 0:img.size(2)] = img
    return imgs
Example 22
def make_audio(data, vocab):
    """ Batch audio data. """
    nfft = data[0].size(0)
    t = max([t.size(1) for t in data])
    sounds = torch.zeros(len(data), 1, nfft, t)
    for i, spect in enumerate(data):
        sounds[i, :, :, 0:spect.size(1)] = spect
    return sounds
Example 23
def build_dataset_iter(datasets, fields, opt, is_train=True):
    """
    This returns a user-defined train/validate data iterator for the trainer
    to iterate over. We implement a simple ordered iterator strategy here,
    but a more sophisticated strategy such as curriculum learning is fine too.
    """
    batch_size = opt.batch_size if is_train else opt.valid_batch_size
    if is_train and opt.batch_type == "tokens":
        def batch_size_fn(new, count, sofar):
            """
            In the token batching scheme, the number of sequences is limited
            such that the total number of src/tgt tokens (including padding)
            in a batch <= batch_size.
            """
            # Maintains the longest src and tgt lengths in the current batch
            global max_src_in_batch, max_tgt_in_batch
            # Reset the current longest lengths at a new batch (count=1)
            if count == 1:
                max_src_in_batch = 0
                max_tgt_in_batch = 0
            # Src: <bos> w1 ... wN <eos>
            max_src_in_batch = max(max_src_in_batch, len(new.src) + 2)
            # Tgt: w1 ... wN <eos>
            max_tgt_in_batch = max(max_tgt_in_batch, len(new.tgt) + 1)
            src_elements = count * max_src_in_batch
            tgt_elements = count * max_tgt_in_batch
            return max(src_elements, tgt_elements)
    else:
        batch_size_fn = None
    device = "cuda" if opt.gpu_ranks else "cpu"
    return DatasetLazyIter(datasets, fields, batch_size, batch_size_fn,
                           device, is_train)
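A hedged trace of the token-batching rule: the returned "effective size" is count times the longest padded length on the larger of the two sides. The rule is repeated here so the trace runs standalone, and the Ex namedtuple is a stand-in assumption for a torchtext example:

from collections import namedtuple

Ex = namedtuple("Ex", ["src", "tgt"])

max_src_in_batch = max_tgt_in_batch = 0

def batch_size_fn(new, count, sofar):
    # Same rule as the inner batch_size_fn above.
    global max_src_in_batch, max_tgt_in_batch
    if count == 1:
        max_src_in_batch = max_tgt_in_batch = 0
    max_src_in_batch = max(max_src_in_batch, len(new.src) + 2)  # + <bos>/<eos>
    max_tgt_in_batch = max(max_tgt_in_batch, len(new.tgt) + 1)  # + <eos>
    return max(count * max_src_in_batch, count * max_tgt_in_batch)

print(batch_size_fn(Ex(src=[0] * 10, tgt=[0] * 8), 1, 0))   # 12 = 1 * (10 + 2)
print(batch_size_fn(Ex(src=[0] * 4,  tgt=[0] * 20), 2, 0))  # 42 = 2 * (20 + 1)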
Example 24
def lazily_load_dataset(corpus_type, opt):
    """
    Dataset generator. Don't do extra stuff here, like printing,
    because it will be postponed to the first loading time.
    Args:
        corpus_type: 'train' or 'valid'
    Returns:
        A list of datasets; the dataset(s) are lazily loaded.
    """
    assert corpus_type in ["train", "valid"]

    def _lazy_dataset_loader(pt_file, corpus_type):
        dataset = torch.load(pt_file)
        logger.info('Loading %s dataset from %s, number of examples: %d' %
                    (corpus_type, pt_file, len(dataset)))
        return dataset

    # Sort the glob output by file name (by increasing indexes).
    pts = sorted(glob.glob(opt.data + '.' + corpus_type + '.[0-9]*.pt'))
    if pts:
        for pt in pts:
            yield _lazy_dataset_loader(pt, corpus_type)
    else:
        # Only one inputters.*Dataset, simple!
        pt = opt.data + '.' + corpus_type + '.pt'
        yield _lazy_dataset_loader(pt, corpus_type)
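A hedged usage sketch: since lazily_load_dataset is a generator, each shard is only read from disk when the loop reaches it (opt is a hypothetical namespace and the path prefix an assumption):

from types import SimpleNamespace

opt = SimpleNamespace(data="data/demo")
# Consumes data/demo.train.0.pt, data/demo.train.1.pt, ... if they exist,
# otherwise falls back to the single data/demo.train.pt shard.
for shard in lazily_load_dataset("train", opt):
    print(len(shard))  # each shard is loaded only when iteration reaches it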
Example 25
def from_files(cls, fields, files):
    """Create a QualityEstimationDataset given paths and fields.
    Arguments:
        fields: A dict between field name and field object.
        files: A dict between field name and file dict (with 'name' and
            'reader' keys).
    """
    fields_examples = []
    dataset_fields = []
    # First, load the data for each field.
    for attrib_name, field in fields.items():
        file_dict = files[attrib_name]
        file_name = file_dict['name']
        reader = file_dict['reader']
        if not reader:
            with open(file_name, 'r', encoding='utf8') as f:
                fields_values_for_example = [line.strip() for line in f]
        else:
            fields_values_for_example = reader(file_name)
        fields_examples.append(fields_values_for_example)
        dataset_fields.append((attrib_name, field))
    # Then, add each corresponding sentence from each field.
    nb_lines = [len(fe) for fe in fields_examples]
    assert min(nb_lines) == max(nb_lines)  # assert files have the same size
    return cls(fields_examples, dataset_fields)
Example 26
def __iter__(self):
    for j in range(self.number_of_examples):
        fields_values_for_example = [
            self.fields_examples[i][j]
            for i in range(len(self.dataset_fields))
        ]
        yield data.Example.fromlist(
            fields_values_for_example, self.dataset_fields
        )