Python source code examples: bert.tokenization.FullTokenizer()
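The examples below are collected from open-source projects and all follow the same basic pattern: construct a FullTokenizer from a WordPiece vocabulary file (do_lower_case=True for uncased checkpoints, False for cased ones), then call tokenize() to split text into WordPiece tokens and convert_tokens_to_ids() to map them to vocabulary indices. A minimal sketch of that pattern, assuming the BERT tokenization module is available (e.g. via pip install bert-tensorflow or the original BERT repository) and using a placeholder vocabulary path:

from bert import tokenization

# Placeholder path; point this at the vocab.txt shipped with a BERT checkpoint.
vocab_path = '/path/to/uncased_L-12_H-768_A-12/vocab.txt'

# Use do_lower_case=True for uncased models and False for cased models.
tokenizer = tokenization.FullTokenizer(vocab_file=vocab_path, do_lower_case=True)

tokens = tokenizer.tokenize("UNwanted,running")   # list of WordPiece tokens
ids = tokenizer.convert_tokens_to_ids(tokens)     # corresponding vocabulary indices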
Example 1
def __init__(self, id, args, worker_address, sink_address):
    super().__init__()
    self.model_dir = args.model_dir
    self.config_fp = os.path.join(self.model_dir, 'bert_config.json')
    self.checkpoint_fp = os.path.join(self.model_dir, 'bert_model.ckpt')
    self.vocab_fp = os.path.join(args.model_dir, 'vocab.txt')
    self.tokenizer = tokenization.FullTokenizer(vocab_file=self.vocab_fp)
    self.max_seq_len = args.max_seq_len
    self.worker_id = id
    self.daemon = True
    self.model_fn = model_fn_builder(
        bert_config=modeling.BertConfig.from_json_file(self.config_fp),
        init_checkpoint=self.checkpoint_fp,
        pooling_strategy=args.pooling_strategy,
        pooling_layer=args.pooling_layer
    )
    # Pin this worker to its own GPU and limit how much memory it may claim.
    os.environ['CUDA_VISIBLE_DEVICES'] = str(self.worker_id)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.per_process_gpu_memory_fraction = args.gpu_memory_fraction
    self.estimator = Estimator(self.model_fn, config=RunConfig(session_config=config))
    self.exit_flag = multiprocessing.Event()
    self.logger = set_logger('WORKER-%d' % self.worker_id)
    self.worker_address = worker_address
    self.sink_address = sink_address
Example 2
def test_full_tokenizer(self):
    vocab_tokens = [
        "[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn",
        "##ing", ","
    ]
    with tempfile.NamedTemporaryFile(delete=False) as vocab_writer:
        # The temp file is opened in binary mode, so encode the vocab before writing.
        vocab_writer.write("".join([x + "\n" for x in vocab_tokens]).encode("utf-8"))
        vocab_file = vocab_writer.name

    tokenizer = tokenization.FullTokenizer(vocab_file)
    os.unlink(vocab_file)

    tokens = tokenizer.tokenize(u"UNwant\u00E9d,running")
    self.assertAllEqual(tokens, ["un", "##want", "##ed", ",", "runn", "##ing"])

    self.assertAllEqual(
        tokenizer.convert_tokens_to_ids(tokens), [7, 4, 5, 10, 8, 9])
Example 3
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--review-json-dir', type=str, default='../dat/PeerRead/arxiv.all/all/reviews')
    parser.add_argument('--parsedpdf-json-dir', type=str, default='../dat/PeerRead/arxiv.all/all/parsed_pdfs')
    parser.add_argument('--out-dir', type=str, default='../dat/PeerRead/proc')
    parser.add_argument('--out-file', type=str, default='arxiv-all.tf_record')
    parser.add_argument('--vocab-file', type=str, default='../../bert/pre-trained/uncased_L-12_H-768_A-12/vocab.txt')
    parser.add_argument('--max-abs-len', type=int, default=250)
    parser.add_argument('--venue', type=int, default=0)
    parser.add_argument('--year', type=int, default=2017)
    args = parser.parse_args()

    tokenizer = tokenization.FullTokenizer(
        vocab_file=args.vocab_file, do_lower_case=True)

    clean_PeerRead_dataset(args.review_json_dir, args.parsedpdf_json_dir,
                           args.venue, args.year,
                           args.out_dir, args.out_file,
                           args.max_abs_len, tokenizer, is_arxiv=True)
Example 4
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--data-dir', type=str, default=None)
    parser.add_argument('--out-dir', type=str, default='../dat/reddit')
    parser.add_argument('--out-file', type=str, default='proc.tf_record')
    parser.add_argument('--vocab-file', type=str, default='../../bert/pre-trained/uncased_L-12_H-768_A-12/vocab.txt')
    parser.add_argument('--max-abs-len', type=int, default=128)
    parser.add_argument('--subsample', type=int, default=0)
    parser.add_argument('--use-latest-reddit', type=bool, default=True)
    args = parser.parse_args()

    tokenizer = tokenization.FullTokenizer(
        vocab_file=args.vocab_file, do_lower_case=True)

    process_reddit_dataset(args.data_dir, args.out_dir, args.out_file,
                           args.max_abs_len, tokenizer, args.subsample, args.use_latest_reddit)
Example 5
def __init__(self, model_fname="/notebooks/embedding/data/sentence-embeddings/bert/tune-ckpt",
             bertconfig_fname="/notebooks/embedding/data/sentence-embeddings/bert/multi_cased_L-12_H-768_A-12/bert_config.json",
             vocab_fname="/notebooks/embedding/data/sentence-embeddings/bert/multi_cased_L-12_H-768_A-12/vocab.txt",
             max_seq_length=32, dimension=768, num_labels=2, use_notebook=False):
    super().__init__("bert", dimension, use_notebook)
    config = BertConfig.from_json_file(bertconfig_fname)
    self.max_seq_length = max_seq_length
    self.tokenizer = FullTokenizer(vocab_file=vocab_fname, do_lower_case=False)
    self.model, self.input_ids, self.input_mask, self.segment_ids, self.probs = make_bert_graph(config,
                                                                                                max_seq_length,
                                                                                                1.0,
                                                                                                num_labels,
                                                                                                tune=False)
    saver = tf.train.Saver(tf.global_variables())
    self.sess = tf.Session()
    checkpoint_path = tf.train.latest_checkpoint(model_fname)
    saver.restore(self.sess, checkpoint_path)
Example 6
def get_bert(BERT_PT_PATH, bert_type, do_lower_case, no_pretraining):
    bert_config_file = os.path.join(BERT_PT_PATH, f'bert_config_{bert_type}.json')
    vocab_file = os.path.join(BERT_PT_PATH, f'vocab_{bert_type}.txt')
    init_checkpoint = os.path.join(BERT_PT_PATH, f'pytorch_model_{bert_type}.bin')

    bert_config = BertConfig.from_json_file(bert_config_file)
    tokenizer = tokenization.FullTokenizer(
        vocab_file=vocab_file, do_lower_case=do_lower_case)
    bert_config.print_status()

    model_bert = BertModel(bert_config)
    if not no_pretraining:
        model_bert.load_state_dict(torch.load(init_checkpoint, map_location='cpu'))
        print("Loaded pre-trained parameters.")
    model_bert.to(device)

    return model_bert, tokenizer, bert_config
Example 7
def __init__(self, label_map, vocab_file,
             max_seq_length, do_lower_case,
             converter):
    """Initializes an instance of BertExampleBuilder.

    Args:
      label_map: Mapping from tags to tag IDs.
      vocab_file: Path to BERT vocabulary file.
      max_seq_length: Maximum sequence length.
      do_lower_case: Whether to lower case the input text. Should be True for
        uncased models and False for cased models.
      converter: Converter from text targets to tags.
    """
    self._label_map = label_map
    self._tokenizer = tokenization.FullTokenizer(vocab_file,
                                                 do_lower_case=do_lower_case)
    self._max_seq_length = max_seq_length
    self._converter = converter
    self._pad_id = self._get_pad_id()
    self._keep_tag_id = self._label_map['KEEP']
Example 8
def test_full_tokenizer(self):
    vocab_tokens = [
        "[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn",
        "##ing", ","
    ]
    with tempfile.NamedTemporaryFile(delete=False) as vocab_writer:
        if six.PY2:
            vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))
        else:
            vocab_writer.write("".join(
                [x + "\n" for x in vocab_tokens]).encode("utf-8"))
        vocab_file = vocab_writer.name

    tokenizer = tokenization.FullTokenizer(vocab_file)
    os.unlink(vocab_file)

    tokens = tokenizer.tokenize(u"UNwant\u00E9d,running")
    self.assertAllEqual(tokens, ["un", "##want", "##ed", ",", "runn", "##ing"])

    self.assertAllEqual(
        tokenizer.convert_tokens_to_ids(tokens), [7, 4, 5, 10, 8, 9])
Example 9
def create_tokenizer_from_hub_module():
    """Get the vocab file and casing info from the Hub module."""
    with tf.Graph().as_default():
        bert_module = hub.Module(BERT_MODEL_HUB)
        tokenization_info = bert_module(signature="tokenization_info", as_dict=True)
        with tf.Session() as sess:
            vocab_file, do_lower_case = sess.run([tokenization_info["vocab_file"],
                                                  tokenization_info["do_lower_case"]])

    return FullTokenizer(
        vocab_file=vocab_file, do_lower_case=do_lower_case)
Example 10
def __init__(self, vocab_file=None, **kwargs):
    super().__init__()
    if vocab_file is None:
        raise ValueError(
            'Vocabulary file is required to initialize BERT tokenizer'
        )

    try:
        from bert.tokenization import FullTokenizer
    except ImportError:
        raise ValueError(
            "Please install bert-tensorflow: pip install bert-tensorflow"
        )

    self.tokenizer = FullTokenizer(vocab_file)
Example 11
def main(_):
    tf.logging.set_verbosity(tf.logging.INFO)

    tokenizer = tokenization.FullTokenizer(
        vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)

    input_files = []
    for input_pattern in FLAGS.input_file.split(","):
        input_files.extend(tf.gfile.Glob(input_pattern))

    tf.logging.info("*** Reading from input files ***")
    for input_file in input_files:
        tf.logging.info("  %s", input_file)

    rng = random.Random(FLAGS.random_seed)
    instances = create_training_instances(
        input_files, tokenizer, FLAGS.max_seq_length, FLAGS.dupe_factor,
        FLAGS.short_seq_prob, FLAGS.masked_lm_prob, FLAGS.max_predictions_per_seq,
        rng)

    output_files = FLAGS.output_file.split(",")
    tf.logging.info("*** Writing to output files ***")
    for output_file in output_files:
        tf.logging.info("  %s", output_file)

    write_instance_to_example_files(instances, tokenizer, FLAGS.max_seq_length,
                                    FLAGS.max_predictions_per_seq, output_files)
Example 12
def buzzy_title_based_sim_dfs(treat_strength, con_strength, noise_level, setting="simple", seed=0,
                              base_output_dir='../dat/sim/peerread_buzzytitle_based/'):
    labeler = make_buzzy_based_simulated_labeler(treat_strength, con_strength, noise_level, setting=setting, seed=seed)

    num_splits = 10
    dev_splits = [0]
    test_splits = [0]

    # data_file = '../dat/reddit/proc.tf_record'
    # vocab_file = "../../bert/pre-trained/uncased_L-12_H-768_A-12/vocab.txt"
    tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file, do_lower_case=True)

    input_dataset_from_filenames = make_input_fn_from_file(data_file,
                                                           250,
                                                           num_splits,
                                                           dev_splits,
                                                           test_splits,
                                                           tokenizer,
                                                           is_training=False,
                                                           filter_test=False,
                                                           shuffle_buffer_size=25000,
                                                           seed=seed,
                                                           labeler=labeler)
    output_df = dataset_fn_to_df(input_dataset_from_filenames)
    output_df = output_df.rename(index=str, columns={'theorem_referenced': 'treatment'})

    output_dir = os.path.join(base_output_dir, "mode{}".format(setting))
    os.makedirs(output_dir, exist_ok=True)
    output_path = os.path.join(output_dir, "beta0{}.beta1{}.gamma{}.tsv".format(treat_strength, con_strength, noise_level))
    output_df.to_csv(output_path, '\t')
Example 13
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--datasets-dir', type=str, default='../dat/PeerRead')
    parser.add_argument('--vocab-file', type=str, default='../../bert/pre-trained/uncased_L-12_H-768_A-12/vocab.txt')
    args = parser.parse_args()

    datasets_dir = args.datasets_dir
    tokenizer = tokenization.FullTokenizer(
        vocab_file=args.vocab_file, do_lower_case=True)

    def proc_dataset(dataset):
        all_dir = os.path.join(datasets_dir, dataset_paths[dataset], 'all')
        review_json_dir = os.path.join(all_dir, 'reviews')
        parsedpdf_json_dir = os.path.join(all_dir, 'parsed_pdfs')
        venue = dataset_venues[dataset]
        year = dataset_years[dataset]
        out_dir = os.path.join(datasets_dir, 'proc')
        out_file = dataset + '.tf_record'
        max_abs_len = 250

        clean_PeerRead_dataset(review_json_dir, parsedpdf_json_dir, venue, year, out_dir, out_file, max_abs_len,
                               tokenizer)

    # pool = mp.Pool(4)
    # pool.map(proc_dataset, dataset_names)

    for dataset in dataset_names:
        proc_dataset(dataset)
Example 14
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--review-json-dir', type=str, default=None)
    parser.add_argument('--vocab-file', type=str, default=None)
    args = parser.parse_args()

    tokenizer = tokenization.FullTokenizer(
        vocab_file=args.vocab_file, do_lower_case=True)

    review_json_dir = args.review_json_dir
    print('Reading reviews from...', review_json_dir)
    paper_json_filenames = sorted(glob.glob('{}/*.json'.format(review_json_dir)))

    paper_json_filename = paper_json_filenames[0]
    with io.open(paper_json_filename) as json_file:
        loaded = json.load(json_file)
    abstract = loaded['abstract']
    print(abstract)

    tokens = tokenizer.tokenize(abstract)
    print(tokens)
    print(tokenizer.convert_tokens_to_ids(tokens))

    # for idx, paper_json_filename in enumerate(paper_json_filenames):
    #     with io.open(paper_json_filename) as json_file:
    #         loaded = json.load(json_file)
    #
    #     print(loaded['abstract'])
Example 15
def __init__(self, train_corpus_fname=None, tokenized_train_corpus_fname=None,
             test_corpus_fname=None, tokenized_test_corpus_fname=None,
             model_name="bert", model_save_path=None, vocab_fname=None, eval_every=1000,
             batch_size=32, num_epochs=10, dropout_keep_prob_rate=0.9, model_ckpt_path=None,
             sp_model_path=None):
    # configurations
    tf.logging.set_verbosity(tf.logging.INFO)
    self.model_name = model_name
    self.eval_every = eval_every
    self.model_ckpt_path = model_ckpt_path
    self.model_save_path = model_save_path
    self.batch_size = batch_size
    self.num_epochs = num_epochs
    self.dropout_keep_prob_rate = dropout_keep_prob_rate
    self.best_valid_score = 0.0
    if not os.path.exists(model_save_path):
        os.mkdir(model_save_path)
    # define tokenizer
    if self.model_name == "bert":
        self.tokenizer = FullTokenizer(vocab_file=vocab_fname, do_lower_case=False)
    elif self.model_name == "xlnet":
        sp = spm.SentencePieceProcessor()
        sp.Load(sp_model_path)
        self.tokenizer = sp
    else:
        self.tokenizer = get_tokenizer("mecab")
    # load or tokenize corpus
    self.train_data, self.train_data_size = self.load_or_tokenize_corpus(train_corpus_fname, tokenized_train_corpus_fname)
    self.test_data, self.test_data_size = self.load_or_tokenize_corpus(test_corpus_fname, tokenized_test_corpus_fname)
Example 16
def bert_tokenize(vocab_fname, corpus_fname, output_fname):
    tokenizer = FullTokenizer(vocab_file=vocab_fname, do_lower_case=False)
    with open(corpus_fname, 'r', encoding='utf-8') as f1, \
            open(output_fname, 'w', encoding='utf-8') as f2:
        for line in f1:
            sentence = line.replace('\n', '').strip()
            tokens = tokenizer.tokenize(convert_to_unicode(sentence))
            tokenized_sent = ' '.join(tokens)
            f2.writelines(tokenized_sent + '\n')
Example 17
def _create_tokenizer_from_hub_module(uri):
    """Get the vocab file and casing info from the Hub module."""
    with tf.Graph().as_default():
        bert_module = tf_hub.Module(uri, trainable=False)
        tokenization_info = bert_module(
            signature="tokenization_info", as_dict=True)
        with tf.Session() as sess:
            vocab_file, do_lower_case = sess.run(
                [
                    tokenization_info["vocab_file"],
                    tokenization_info["do_lower_case"]
                ])

    return FullTokenizer(
        vocab_file=vocab_file, do_lower_case=do_lower_case)
Example 18
def load_model(self, model: str, model_path: str, max_seq_length: int):
    g = tf.Graph()
    with g.as_default():
        self.input_ids = tf.placeholder(dtype=tf.int32, shape=[None, max_seq_length])
        self.input_masks = tf.placeholder(dtype=tf.int32, shape=[None, max_seq_length])
        self.segment_ids = tf.placeholder(dtype=tf.int32, shape=[None, max_seq_length])

        hub_module = hub.Module(model_path)
        bert_inputs = dict(
            input_ids=self.input_ids,
            input_mask=self.input_masks,
            segment_ids=self.segment_ids
        )
        self.bert_outputs = hub_module(bert_inputs, signature="tokens", as_dict=True)
        tokenization_info = hub_module(signature="tokenization_info", as_dict=True)
        init_op = tf.group([tf.global_variables_initializer(), tf.tables_initializer()])
        g.finalize()

    self.sess = tf.Session(graph=g)
    self.sess.run(init_op)
    vocab_file, do_lower_case = self.sess.run(
        [
            tokenization_info["vocab_file"],
            tokenization_info["do_lower_case"],
        ]
    )

    Embeddings.tokenizer = FullTokenizer(vocab_file=vocab_file, do_lower_case=do_lower_case)
    self.max_seq_length = max_seq_length
    self.model_name = model
    print("Model loaded successfully!")
Example 19
def __init__(self, batch_size=args.batch_size):
    self.mode = None
    self.max_seq_length = args.max_seq_len
    self.tokenizer = tokenization.FullTokenizer(vocab_file=args.vocab_file, do_lower_case=True)
    self.batch_size = batch_size
    self.estimator = None
    self.processor = SimProcessor()
    tf.logging.set_verbosity(tf.logging.INFO)
Example 20
def create_tokenizer_from_hub_module(bert_hub_module_handle):
    """Get the vocab file and casing info from the Hub module."""
    with tf.Graph().as_default():
        bert_module = hub.Module(bert_hub_module_handle)
        tokenization_info = bert_module(signature="tokenization_info", as_dict=True)
        with tf.Session() as sess:
            vocab_file, do_lower_case = sess.run([tokenization_info["vocab_file"],
                                                  tokenization_info["do_lower_case"]])

    return tokenization.FullTokenizer(
        vocab_file=vocab_file, do_lower_case=do_lower_case)
Example 21
def preproc_doc(document):
    """Convert document to list of TF Examples for binary order classification.

    Args:
      document: a CCNews article (i.e. a list of sentences)

    Returns:
      A list of tf.Examples of binary orderings of pairs of sentences in the
      document. The tf.Examples are serialized to string to be written directly
      to TFRecord.
    """
    tokenizer = tokenization.FullTokenizer(
        vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)

    # document = [
    #     tokenization.convert_to_unicode(
    #         unidecode.unidecode(line.decode("utf-8"))) for line in document
    # ]

    sent_tokens = [tokenizer.tokenize(sent) for sent in document if sent]
    sent_tokens = [sent for sent in sent_tokens if len(sent) > 1]
    if len(sent_tokens) < 8:
        return []

    # Convert token lists into ids and add any needed tokens and padding for BERT.
    tf_example = convert_instance_to_tf_example(tokenizer, sent_tokens,
                                                FLAGS.max_sent_length,
                                                FLAGS.max_para_length)

    # Serialize the TF Example for writing to file.
    tf_examples = [tf_example.SerializeToString()]
    return tf_examples
Example 22
def preproc_doc(document):
    """Convert document to list of TF Examples for binary order classification.

    Args:
      document: a CCNews article (i.e. a list of sentences)

    Returns:
      A list of tf.Examples of binary orderings of pairs of sentences in the
      document. The tf.Examples are serialized to string to be written directly
      to TFRecord.
    """
    tokenizer = tokenization.FullTokenizer(
        vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)

    document = [
        tokenization.convert_to_unicode(
            unidecode.unidecode(line.decode("utf-8"))) for line in document
    ]

    sent_tokens = [tokenizer.tokenize(sent) for sent in document if sent]
    sent_tokens = [sent for sent in sent_tokens if len(sent) > 1]
    if len(sent_tokens) < 8:
        return []

    # Convert token lists into ids and add any needed tokens and padding for BERT.
    tf_example = convert_instance_to_tf_example(tokenizer, sent_tokens,
                                                FLAGS.max_sent_length,
                                                FLAGS.max_para_length)

    # Serialize the TF Example for writing to file.
    tf_examples = [tf_example.SerializeToString()]
    return tf_examples
Example 23
def main(unused_argv):
    tokenizer = FullTokenizer(FLAGS.tokenizer_vocabulary)

    print('Loading ' + str(FLAGS.dataset_name) + ' dataset from ' +
          FLAGS.input_filepath)

    # The debugging file saves all of the processed SQL queries.
    debugging_file = gfile.Open(
        os.path.join('/'.join(FLAGS.output_filepath.split('/')[:-1]),
                     FLAGS.dataset_name + '_'.join(FLAGS.splits) + '_gold.txt'),
        'w')

    # The output file will save a sequence of string-serialized JSON objects, one
    # line per object.
    output_file = gfile.Open(os.path.join(FLAGS.output_filepath), 'w')

    if FLAGS.dataset_name.lower() == 'spider':
        num_examples_created, num_examples_failed = process_spider(
            output_file, debugging_file, tokenizer)
    elif FLAGS.dataset_name.lower() == 'wikisql':
        num_examples_created, num_examples_failed = process_wikisql(
            output_file, debugging_file, tokenizer)
    else:
        num_examples_created, num_examples_failed = process_michigan_datasets(
            output_file, debugging_file, tokenizer)

    print('Wrote %s examples, could not annotate %s examples.' %
          (num_examples_created, num_examples_failed))
    debugging_file.write('Wrote %s examples, could not annotate %s examples.' %
                         (num_examples_created, num_examples_failed))
    debugging_file.close()
    output_file.close()
Example 24
def main(_):
    tokenizer = tokenization.FullTokenizer(
        vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)

    examples = read_examples(input_file=FLAGS.input_file)

    # Pre-shuffle the input to avoid having to make a very large shuffle
    # buffer in the `input_fn`.
    rng = random.Random(12345)
    rng.shuffle(examples)

    # We write to a temporary file to avoid storing very large
    # constant tensors in memory.
    writer = FeatureWriter(filename=FLAGS.output_file)
    features = []

    def append_feature(feature):
        features.append(feature)
        writer.process_feature(feature)

    convert_examples_to_features(
        examples=examples,
        tokenizer=tokenizer,
        max_doc_length=FLAGS.max_seq_length,
        doc_stride=FLAGS.doc_stride,
        max_query_length=FLAGS.max_query_length,
        output_fn=append_feature)

    writer.close()
    tf.logging.info("%d original examples read.", len(examples))
    tf.logging.info("%d split records written.", writer.num_features)

    if FLAGS.feature_file is not None:
        json.dump([[vars(ee) for ee in examples], [vars(ff) for ff in features]],
                  tf.gfile.Open(FLAGS.feature_file, "w"))
Example 25
def __init__(self, is_training):
    self.is_training = is_training
    self.tokenizer = tokenization.FullTokenizer(
        vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)
Example 26
def bert_preprocess(filename, vocab):
    tokenizer = tokenization.FullTokenizer(
        vocab_file=vocab, do_lower_case=False)
    new_filename = filename + ".bert"
    f1 = open(new_filename, 'w')
    per_count = 0
    with open(filename, "r") as f:
        lines = f.readlines()
        for line in lines:
            str1 = line.split("\t")[1]
            label1 = line.split("\t")[0]
            new_label_list = []
            old_label_list = label1.split(' ')
            word_list = str1.split(' ')
            tokens = []
            tokens.append('[CLS]')
            new_label_list.append('O')
            per_count = 0
            for i, (w, t) in enumerate(zip(word_list, old_label_list)):
                # WordPiece may split a word into several sub-tokens; keep the
                # original label on the first piece and mark the rest with "X".
                token = tokenizer.tokenize(w)
                tokens.extend(token)
                for i, _ in enumerate(token):
                    if i == 0:
                        new_label_list.append(t)
                    else:
                        new_label_list.append("X")
            tokens.append('[SEP]')
            new_label_list.append('O')
            assert len(tokens) == len(new_label_list)
            rm_new_label_list = [i for i in new_label_list if i != 'O' and i != 'X']
            rm_old_label_list = [i for i in old_label_list if i != 'O' and i != 'X']
            assert len(rm_new_label_list) == len(rm_old_label_list)
            f1.write(" ".join(new_label_list) + '\t' +
                     " ".join(tokens) + '\n')