Python源码示例:bert.tokenization.printable_text()
示例1
def __repr__(self):
    """Return a readable summary of this document-level QA example."""
    pieces = [
        "document_id: %s" % self.document_id,
        "qas_id: %s" % tokenization.printable_text(self.qas_id),
        "question_text: %s" % tokenization.printable_text(self.question_text),
        "doc_tokens: %s ..." % " ".join(self.doc_tokens[:20]),
        "length of doc_tokens: %d" % len(self.doc_tokens),
    ]
    s = ", ".join(pieces)
    if self.orig_answer_texts:
        s += ", orig_answer_texts: {}".format(self.orig_answer_texts)
    if self.start_positions and self.end_positions:
        s += ", start_positions: {}".format(self.start_positions)
        s += ", end_positions: {}".format(self.end_positions)
        s += ", token_answer: "
        # Render each answer span as the space-joined tokens it covers.
        for begin, stop in zip(self.start_positions, self.end_positions):
            s += "{}, ".format(" ".join(self.doc_tokens[begin:(stop + 1)]))
    return s
示例2
def __repr__(self):
    """Return a readable summary of this QA example for logging/debugging."""
    s = "id: %s" % (self.qid)
    s += ", question_text: %s" % (
        tokenization.printable_text(self.question_text))
    s += ", doc_tokens: [%s]" % (" ".join(self.doc_tokens))
    # Guard each position on its own value: the original re-tested
    # `self.start_position` to guard end_position (copy-paste), and a
    # truthiness test hid a legitimate position 0.
    if self.start_position is not None:
        s += ", start_positions: %s" % (self.start_position)
    if self.end_position is not None:
        s += ", end_positions: %s" % (self.end_position)
    return s
示例3
def __repr__(self):
    """Return a readable summary of this QA example for logging/debugging."""
    fields = [
        "id: %s" % self.qid,
        "question_text: %s" % tokenization.printable_text(self.question_text),
        "doc_tokens: [%s]" % " ".join(self.doc_tokens),
        "answer_text: %s" % self.orig_answer_text,
    ]
    return ", ".join(fields)
示例4
def __str__(self):
    """Render this pre-training instance as human-readable multi-line text."""
    tokens_text = " ".join(
        tokenization.printable_text(t) for t in self.tokens)
    labels_text = " ".join(
        tokenization.printable_text(t) for t in self.masked_lm_labels)
    lines = [
        "tokens: %s" % tokens_text,
        "segment_ids: %s" % " ".join(str(i) for i in self.segment_ids),
        "is_random_next: %s" % self.is_random_next,
        "masked_lm_positions: %s" % " ".join(
            str(i) for i in self.masked_lm_positions),
        "masked_lm_labels: %s" % labels_text,
    ]
    # Trailing blank line matches the original's final "\n" separator.
    return "\n".join(lines) + "\n\n"
示例5
def __repr__(self):
    """Return a readable summary of this QA example for logging/debugging."""
    s = "qas_id: %s" % (tokenization.printable_text(self.qas_id))
    s += ", question_text: %s" % (
        tokenization.printable_text(self.question_text))
    s += ", doc_tokens: [%s]" % (" ".join(self.doc_tokens))
    # Guard each position on its own value: the original re-tested
    # `self.start_position` to guard end_position (copy-paste), and a
    # truthiness test hid a legitimate position 0.
    if self.start_position is not None:
        s += ", start_position: %d" % (self.start_position)
    if self.end_position is not None:
        s += ", end_position: %d" % (self.end_position)
    return s
示例6
def __repr__(self):
    """Return a readable, multi-line summary of this numeric-QA example."""
    s = "qas_id: %s" % tokenization.printable_text(self.qas_id)
    s += ", \nquestion: %s" % " ".join(self.question_tokens)
    s += ", \npassage: %s" % " ".join(self.passage_tokens)
    # Optional fields are appended only when truthy, exactly as before;
    # a table keeps the ten near-identical branches in one place.
    optional_fields = (
        ("numbers_in_passage", self.numbers_in_passage),
        ("number_indices", self.number_indices),
        ("answer_type", self.answer_type),
        ("number_of_answer", self.number_of_answer),
        ("passage_spans", self.passage_spans),
        ("question_spans", self.question_spans),
        ("add_sub_expressions", self.add_sub_expressions),
        ("counts", self.counts),
        ("negations", self.negations),
        ("answer_annotations", self.answer_annotations),
    )
    for label, value in optional_fields:
        if value:
            s += ", \n{}: {}".format(label, value)
    return s
示例7
def __repr__(self):
    """Return a readable summary of this SQuAD-style example."""
    s = "qas_id: %s" % (tokenization.printable_text(self.qas_id))
    s += ", question_text: %s" % (
        tokenization.printable_text(self.question_text))
    s += ", doc_tokens: [%s]" % (" ".join(self.doc_tokens))
    # Test each attribute on its own value: the original repeated
    # `if self.start_position:` for all three fields, which tied
    # end_position/is_impossible to start_position and hid a valid
    # position 0 (falsy).
    if self.start_position is not None:
        s += ", start_position: %d" % (self.start_position)
    if self.end_position is not None:
        s += ", end_position: %d" % (self.end_position)
    if self.is_impossible is not None:
        s += ", is_impossible: %r" % (self.is_impossible)
    return s
示例8
def __str__(self):
    """Render this pre-training instance as human-readable multi-line text."""
    out = []
    out.append("tokens: %s" % " ".join(
        tokenization.printable_text(t) for t in self.tokens))
    out.append("segment_ids: %s" % " ".join(
        str(i) for i in self.segment_ids))
    out.append("is_random_next: %s" % self.is_random_next)
    out.append("masked_lm_positions: %s" % " ".join(
        str(i) for i in self.masked_lm_positions))
    out.append("masked_lm_labels: %s" % " ".join(
        tokenization.printable_text(t) for t in self.masked_lm_labels))
    # Trailing blank line matches the original's final "\n" separator.
    return "\n".join(out) + "\n\n"
示例9
def __repr__(self):
    """Return a readable summary of this SQuAD-style example."""
    s = "qas_id: %s" % (tokenization.printable_text(self.qas_id))
    s += ", question_text: %s" % (
        tokenization.printable_text(self.question_text))
    s += ", doc_tokens: [%s]" % (" ".join(self.doc_tokens))
    # Test each attribute on its own value: the original repeated
    # `if self.start_position:` for all three fields, which tied
    # end_position/is_impossible to start_position and hid a valid
    # position 0 (falsy).
    if self.start_position is not None:
        s += ", start_position: %d" % (self.start_position)
    if self.end_position is not None:
        s += ", end_position: %d" % (self.end_position)
    if self.is_impossible is not None:
        s += ", is_impossible: %r" % (self.is_impossible)
    return s
示例10
def __repr__(self):
    """Return a readable summary of this membership-classification example."""
    s = "qas_id: %s" % (tokenization.printable_text(self.qas_id))
    s += ", question_text: %s" % (
        tokenization.printable_text(self.question_text))
    s += ", doc_tokens: [%s]" % (" ".join(self.doc_tokens))
    # `is not None` rather than truthiness so a legitimate label id of 0
    # is still printed (the original silently dropped it).
    if self.label_id is not None:
        s += ", membership label_id: %d" % (self.label_id)
    return s
示例11
def __str__(self):
    """Render the first document's sentences and segment ids as text."""
    lines = [
        "tokens: %s" % " ".join(
            tokenization.printable_text(t) for t in sent)
        for sent in self.tokens[0]
    ]
    lines.append("segment_ids: %s" % " ".join(
        str(i) for i in self.segment_ids[0]))
    # Trailing blank line matches the original's final "\n" separator.
    return "\n".join(lines) + "\n\n"
示例12
def __str__(self):
    """Render every sentence's tokens plus the segment ids as text."""
    lines = [
        "tokens: %s" % " ".join(
            tokenization.printable_text(t) for t in sent)
        for sent in self.tokens
    ]
    lines.append("segment_ids: %s" % " ".join(
        str(i) for i in self.segment_ids))
    # Trailing blank line matches the original's final "\n" separator.
    return "\n".join(lines) + "\n\n"
示例13
def __repr__(self):
    """Return a readable summary of this SQuAD-style example."""
    s = "qas_id: %s" % (tokenization.printable_text(self.qas_id))
    s += ", question_text: %s" % (
        tokenization.printable_text(self.question_text))
    s += ", doc_tokens: [%s]" % (" ".join(self.doc_tokens))
    # Test each attribute on its own value: the original repeated
    # `if self.start_position:` for all three fields, which tied
    # end_position/is_impossible to start_position and hid a valid
    # position 0 (falsy).
    if self.start_position is not None:
        s += ", start_position: %d" % (self.start_position)
    if self.end_position is not None:
        s += ", end_position: %d" % (self.end_position)
    if self.is_impossible is not None:
        s += ", is_impossible: %r" % (self.is_impossible)
    return s
示例14
def __repr__(self):
    """Return a readable summary of this SQuAD-style example."""
    s = "qas_id: %s" % (tokenization.printable_text(self.qas_id))
    s += ", question_text: %s" % (
        tokenization.printable_text(self.question_text))
    s += ", doc_tokens: [%s]" % (" ".join(self.doc_tokens))
    # Test each attribute on its own value: the original repeated
    # `if self.start_position:` for all three fields, which tied
    # end_position/is_impossible to start_position and hid a valid
    # position 0 (falsy).
    if self.start_position is not None:
        s += ", start_position: %d" % (self.start_position)
    if self.end_position is not None:
        s += ", end_position: %d" % (self.end_position)
    if self.is_impossible is not None:
        s += ", is_impossible: %r" % (self.is_impossible)
    return s
示例15
def __repr__(self):
    """Return a readable summary of this SQuAD-style example."""
    s = "qas_id: %s" % (tokenization.printable_text(self.qas_id))
    s += ", question_text: %s" % (
        tokenization.printable_text(self.question_text))
    s += ", doc_tokens: [%s]" % (" ".join(self.doc_tokens))
    # Test each attribute on its own value: the original repeated
    # `if self.start_position:` for all three fields, which tied
    # end_position/is_impossible to start_position and hid a valid
    # position 0 (falsy).
    if self.start_position is not None:
        s += ", start_position: %d" % (self.start_position)
    if self.end_position is not None:
        s += ", end_position: %d" % (self.end_position)
    if self.is_impossible is not None:
        s += ", is_impossible: %r" % (self.is_impossible)
    return s
示例16
def __repr__(self):
    """Return a readable summary of this SQuAD-style example."""
    s = "qas_id: %s" % (tokenization.printable_text(self.qas_id))
    s += ", question_text: %s" % (
        tokenization.printable_text(self.question_text))
    s += ", doc_tokens: [%s]" % (" ".join(self.doc_tokens))
    # Test each attribute on its own value: the original repeated
    # `if self.start_position:` for all three fields, which tied
    # end_position/is_impossible to start_position and hid a valid
    # position 0 (falsy).
    if self.start_position is not None:
        s += ", start_position: %d" % (self.start_position)
    if self.end_position is not None:
        s += ", end_position: %d" % (self.end_position)
    if self.is_impossible is not None:
        s += ", is_impossible: %r" % (self.is_impossible)
    return s
示例17
def __repr__(self):
    """Return a readable summary of this SQuAD-style example."""
    s = "qas_id: %s" % (tokenization.printable_text(self.qas_id))
    s += ", question_text: %s" % (
        tokenization.printable_text(self.question_text))
    s += ", doc_tokens: [%s]" % (" ".join(self.doc_tokens))
    # Test each attribute on its own value: the original repeated
    # `if self.start_position:` for all three fields, which tied
    # end_position/is_impossible to start_position and hid a valid
    # position 0 (falsy).
    if self.start_position is not None:
        s += ", start_position: %d" % (self.start_position)
    if self.end_position is not None:
        s += ", end_position: %d" % (self.end_position)
    if self.is_impossible is not None:
        s += ", is_impossible: %r" % (self.is_impossible)
    return s
示例18
def __repr__(self):
    """Return a readable summary of this QA example for logging/debugging."""
    s = "qas_id: %s" % (tokenization.printable_text(self.qas_id))
    s += ", question_text: %s" % (
        tokenization.printable_text(self.question_text))
    s += ", doc_tokens: [%s]" % (" ".join(self.doc_tokens))
    # Guard each position on its own value: the original re-tested
    # `self.start_position` to guard end_position (copy-paste), and a
    # truthiness test hid a legitimate position 0.
    if self.start_position is not None:
        s += ", start_position: %d" % (self.start_position)
    if self.end_position is not None:
        s += ", end_position: %d" % (self.end_position)
    return s
示例19
def __repr__(self):
s = ""
# s += "example_id: %s" % (tokenization.printable_text(self.example_id))
s += ", sent_tokens: [%s]" % (" ".join(self.sent_tokens))
if self.term_texts:
s += ", term_texts: {}".format(self.term_texts)
# if self.start_positions:
# s += ", start_positions: {}".format(self.start_positions)
# if self.end_positions:
# s += ", end_positions: {}".format(self.end_positions)
if self.polarities:
s += ", polarities: {}".format(self.polarities)
return s
示例20
def __repr__(self):
    """Return a readable summary of this SQuAD-style example."""
    s = "qas_id: %s" % (tokenization.printable_text(self.qas_id))
    s += ", question_text: %s" % (
        tokenization.printable_text(self.question_text))
    s += ", doc_tokens: [%s]" % (" ".join(self.doc_tokens))
    # Test each attribute on its own value: the original repeated
    # `if self.start_position:` for all three fields, which tied
    # end_position/is_impossible to start_position and hid a valid
    # position 0 (falsy).
    if self.start_position is not None:
        s += ", start_position: %d" % (self.start_position)
    if self.end_position is not None:
        s += ", end_position: %d" % (self.end_position)
    if self.is_impossible is not None:
        s += ", is_impossible: %r" % (self.is_impossible)
    return s
示例21
def __repr__(self):
    """Return a readable summary of this QA example for logging/debugging."""
    s = "qas_id: %s" % (tokenization.printable_text(self.qas_id))
    s += ", question_text: %s" % (
        tokenization.printable_text(self.question_text))
    s += ", doc_tokens: [%s]" % (" ".join(self.doc_tokens))
    # Guard each position on its own value: the original re-tested
    # `self.start_position` to guard end_position (copy-paste), and a
    # truthiness test hid a legitimate position 0.
    if self.start_position is not None:
        s += ", start_position: %d" % (self.start_position)
    if self.end_position is not None:
        s += ", end_position: %d" % (self.end_position)
    return s
示例22
def __repr__(self):
    """Return a readable summary of this paragraph-level QA example."""
    s = "qas_id: %s" % (tokenization.printable_text(self.qas_id))
    # Separate every field with ", " — the original ran doc_index and
    # para_index together with the preceding field.
    s += ", doc_index: %d" % (self.doc_index)
    s += ", para_index: %d" % (self.para_index)
    s += ", question_text: %s" % (
        tokenization.printable_text(self.question_text))
    if self.answer_texts is not None:
        # The original format string had no "{}" placeholder, so
        # answer_texts was silently dropped from the output.
        s += ", answer_texts: {}".format(self.answer_texts)
    return s
示例23
def __repr__(self):
    """Return a readable summary of this QA example for logging/debugging."""
    s = "qas_id: %s" % (tokenization.printable_text(self.qas_id))
    s += ", question_text: %s" % (
        tokenization.printable_text(self.question_text))
    # Guard each position on its own value: the original re-tested
    # `self.start_position` to guard end_position (copy-paste), and a
    # truthiness test hid a legitimate position 0.
    if self.start_position is not None:
        s += ", start_position: %d" % (self.start_position)
    if self.end_position is not None:
        s += ", end_position: %d" % (self.end_position)
    return s
示例24
def __repr__(self):
    """Return a readable summary of this document-level QA example."""
    s = "document_id: %s" % (self.document_id)
    s += ", qas_id: %s" % (tokenization.printable_text(self.qas_id))
    s += ", question_text: %s" % (
        tokenization.printable_text(self.question_text))
    s += ", doc_tokens: [%s]" % (" ".join(self.doc_tokens[:20]))
    s += ", length of doc_tokens: [%d]" % (len(self.doc_tokens))
    # `is not None` rather than truthiness so a legitimate position 0
    # is still printed (the original silently dropped it).
    if self.start_position is not None:
        s += ", start_position: %d" % (self.start_position)
    if self.end_position is not None:
        s += ", end_position: %d" % (self.end_position)
    return s
示例25
def convert_examples_to_features(examples, label_list, max_seq_length, tokenizer):
    """Convert InputExamples into the feature dicts a BERT model consumes.

    Returns a list of OrderedDicts, one per example, each holding:
      input_ids:   vocabulary ids of "[CLS] <text_a tokens> [SEP]", zero-padded;
      input_mask:  1 for real tokens, 0 for padding positions;
      segment_ids: all 0 (only text_a is present in this task);
      label_ids:   integer index of the example's label (not one-hot).
    """
    # Map each label to its index once, up front.
    label_map = {label: i for i, label in enumerate(label_list)}
    input_data = []
    for ex_index, example in enumerate(examples):
        tokens_a = tokenizer.tokenize(example.text_a)
        if ex_index % 10000 == 0:
            tf.logging.info("Writing example %d of %d" % (ex_index, len(examples)))
        # Reserve two positions for the [CLS] and [SEP] markers.
        if len(tokens_a) > max_seq_length - 2:
            tokens_a = tokens_a[:max_seq_length - 2]
        tokens = ["[CLS]"] + tokens_a + ["[SEP]"]
        segment_ids = [0] * len(tokens)
        input_ids = tokenizer.convert_tokens_to_ids(tokens)
        input_mask = [1] * len(input_ids)
        # Zero-pad every sequence up to max_seq_length.
        pad_len = max_seq_length - len(input_ids)
        input_ids.extend([0] * pad_len)
        input_mask.extend([0] * pad_len)
        segment_ids.extend([0] * pad_len)
        assert len(input_ids) == max_seq_length
        assert len(input_mask) == max_seq_length
        assert len(segment_ids) == max_seq_length
        label_id = label_map[example.label]
        if ex_index < 3:
            tf.logging.info("*** Example ***")
            tf.logging.info("guid: %s" % (example.guid))
            tf.logging.info("tokens: %s" % " ".join(
                [tokenization.printable_text(x) for x in tokens]))
            tf.logging.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
            tf.logging.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
            tf.logging.info("segment_ids: %s" % " ".join([str(x) for x in segment_ids]))
            tf.logging.info("label: %s (id = %d)" % (example.label, label_id))
        features = collections.OrderedDict()
        features["input_ids"] = input_ids
        features["input_mask"] = input_mask
        features["segment_ids"] = segment_ids
        features["label_ids"] = label_id
        input_data.append(features)
    return input_data
示例26
def convert_examples_to_features(examples, tokenizer, max_query_length,
                                 entity2id, output_fn):
    """Tokenize each example's query and relations and emit an InputFeatures.

    For every example, builds padded query token ids and masks, plus one
    (ids, mask) pair per relation, logs the first 20 examples, and hands the
    resulting InputFeatures to `output_fn`.
    """
    for example_index, example in tqdm(enumerate(examples)):
        qry_input_ids, qry_input_mask, qry_tokens = get_tokens_and_mask(
            example.question_text, tokenizer, max_query_length)
        relation_input_ids = []
        relation_input_mask = []
        if example.relations is not None:
            for relation in example.relations:
                ids, mask, _ = get_tokens_and_mask(
                    relation, tokenizer, max_query_length)
                relation_input_ids.append(ids)
                relation_input_mask.append(mask)
        # Log the first few examples so the feature pipeline can be spot-checked.
        if example_index < 20:
            tf.logging.info("*** Example ***")
            tf.logging.info("unique_id: %s", example.qas_id)
            tf.logging.info(
                "qry_tokens: %s",
                " ".join(tokenization.printable_text(t) for t in qry_tokens))
            tf.logging.info("qry_input_ids: %s",
                            " ".join(str(i) for i in qry_input_ids))
            tf.logging.info("qry_input_mask: %s",
                            " ".join(str(i) for i in qry_input_mask))
            for idx, (rel_ids, rel_mask) in enumerate(
                    zip(relation_input_ids, relation_input_mask)):
                tf.logging.info("relation_input_ids_%d: %s", idx,
                                " ".join(str(i) for i in rel_ids))
                tf.logging.info("relation_input_mask_%d: %s", idx,
                                " ".join(str(i) for i in rel_mask))
            tf.logging.info("qry_entity_id: %s (%d)", example.subject_entity[0],
                            entity2id.get(example.subject_entity[0], None))
            tf.logging.info("answer entity: %s", str(example.answer_entity))
        feature = InputFeatures(
            qas_id=example.qas_id.encode("utf-8"),
            qry_tokens=qry_tokens,
            qry_input_ids=qry_input_ids,
            qry_input_mask=qry_input_mask,
            relation_input_ids=relation_input_ids,
            relation_input_mask=relation_input_mask,
            qry_entity_id=[entity2id.get(ent, 0) for ent in example.subject_entity],
            answer_mention=example.answer_mention,
            answer_entity=example.answer_entity,
            bridge_mention=example.bridge_mention,
            bridge_entity=example.bridge_entity)
        # Run callback
        output_fn(feature)