Python源码示例:bert.tokenization.BasicTokenizer()

示例1
def __init__(self):
        # Wrap a case-preserving tokenizer: do_lower_case=False keeps the
        # original casing of the input text.
        # NOTE(review): BasicTokenizer is presumably bert.tokenization.BasicTokenizer
        # (per the page title) — confirm against the enclosing module's imports.
        self._tokenizer = BasicTokenizer(do_lower_case=False)
示例2
def test_chinese(self):
    """Each CJK character should be split into its own token."""
    tokenizer = tokenization.BasicTokenizer()
    tokens = tokenizer.tokenize(u"ah\u535A\u63A8zz")
    expected = [u"ah", u"\u535A", u"\u63A8", u"zz"]
    self.assertAllEqual(tokens, expected)
示例3
def test_basic_tokenizer_lower(self):
    """With do_lower_case=True, output is lower-cased and accents are stripped."""
    tokenizer = tokenization.BasicTokenizer(do_lower_case=True)

    # Whitespace (tabs/newlines) is collapsed and punctuation split off.
    result = tokenizer.tokenize(u" \tHeLLo!how  \n Are yoU?  ")
    self.assertAllEqual(result, ["hello", "!", "how", "are", "you", "?"])

    # Accented characters are normalized away (é -> e).
    accented = tokenizer.tokenize(u"H\u00E9llo")
    self.assertAllEqual(accented, ["hello"])
示例4
def test_basic_tokenizer_no_lower(self):
    """With do_lower_case=False, the original casing survives tokenization."""
    tokenizer = tokenization.BasicTokenizer(do_lower_case=False)

    result = tokenizer.tokenize(u" \tHeLLo!how  \n Are yoU?  ")
    expected = ["HeLLo", "!", "how", "Are", "yoU", "?"]
    self.assertAllEqual(result, expected)
示例5
def test_chinese(self):
    """Chinese characters must be emitted as individual tokens."""
    tokenizer = tokenization.BasicTokenizer()
    got = tokenizer.tokenize(u"ah\u535A\u63A8zz")
    want = [u"ah", u"\u535A", u"\u63A8", u"zz"]
    self.assertAllEqual(got, want)
示例6
def test_basic_tokenizer_lower(self):
    """Lower-casing mode: casing and accents are normalized, punctuation split."""
    tokenizer = tokenization.BasicTokenizer(do_lower_case=True)

    got = tokenizer.tokenize(u" \tHeLLo!how  \n Are yoU?  ")
    self.assertAllEqual(got, ["hello", "!", "how", "are", "you", "?"])

    # Accent stripping: é collapses to plain e.
    self.assertAllEqual(tokenizer.tokenize(u"H\u00E9llo"), ["hello"])
示例7
def test_basic_tokenizer_no_lower(self):
    """Case-preserving mode: tokens keep their original capitalization."""
    tokenizer = tokenization.BasicTokenizer(do_lower_case=False)

    got = tokenizer.tokenize(u" \tHeLLo!how  \n Are yoU?  ")
    want = ["HeLLo", "!", "how", "Are", "yoU", "?"]
    self.assertAllEqual(got, want)
示例8
def build_wiki_corpus(n_processes):
    """Build the TriviaQA wikipedia dataset with a lower-casing BasicTokenizer."""
    qa_dir = join(TRIVIA_QA, "qa")
    source_files = {
        "verified": join(qa_dir, "verified-wikipedia-dev.json"),
        "dev": join(qa_dir, "wikipedia-dev.json"),
        "train": join(qa_dir, "wikipedia-train.json"),
        "test": join(qa_dir, "wikipedia-test-without-answers.json"),
    }
    build_dataset("wiki",
                  tokenization.BasicTokenizer(do_lower_case=True),
                  source_files,
                  FastNormalizedAnswerDetector(),
                  n_processes)
示例9
def build_web_corpus(n_processes):
    """Build the TriviaQA web dataset with a lower-casing BasicTokenizer."""
    qa_dir = join(TRIVIA_QA, "qa")
    source_files = {
        "verified": join(qa_dir, "verified-web-dev.json"),
        "dev": join(qa_dir, "web-dev.json"),
        "train": join(qa_dir, "web-train.json"),
        "test": join(qa_dir, "web-test-without-answers.json"),
    }
    build_dataset("web",
                  tokenization.BasicTokenizer(do_lower_case=True),
                  source_files,
                  FastNormalizedAnswerDetector(),
                  n_processes)
示例10
def build_unfiltered_corpus(n_processes):
    """Build the unfiltered TriviaQA web dataset (no verified split exists)."""
    source_files = {
        "dev": join(TRIVIA_QA_UNFILTERED, "unfiltered-web-dev.json"),
        "train": join(TRIVIA_QA_UNFILTERED, "unfiltered-web-train.json"),
        "test": join(TRIVIA_QA_UNFILTERED, "unfiltered-web-test-without-answers.json"),
    }
    build_dataset("unfiltered",
                  tokenization.BasicTokenizer(do_lower_case=True),
                  source_files,
                  FastNormalizedAnswerDetector(),
                  n_processes)
示例11
def build_wiki_sample_corpus(n_processes):
    """Build a small (sample=20) TriviaQA wikipedia dataset for quick iteration."""
    qa_dir = join(TRIVIA_QA, "qa")
    source_files = {
        "verified": join(qa_dir, "verified-wikipedia-dev.json"),
        "dev": join(qa_dir, "wikipedia-dev.json"),
        "train": join(qa_dir, "wikipedia-train.json"),
        "test": join(qa_dir, "wikipedia-test-without-answers.json"),
    }
    build_dataset("wiki-sample",
                  tokenization.BasicTokenizer(do_lower_case=True),
                  source_files,
                  FastNormalizedAnswerDetector(),
                  n_processes,
                  sample=20)
示例12
def build_unfiltered_sample_corpus(n_processes):
    """Build a small (sample=20) unfiltered TriviaQA web dataset."""
    source_files = {
        "dev": join(TRIVIA_QA_UNFILTERED, "unfiltered-web-dev.json"),
        "train": join(TRIVIA_QA_UNFILTERED, "unfiltered-web-train.json"),
        "test": join(TRIVIA_QA_UNFILTERED, "unfiltered-web-test-without-answers.json"),
    }
    build_dataset("unfiltered-sample",
                  tokenization.BasicTokenizer(do_lower_case=True),
                  source_files,
                  FastNormalizedAnswerDetector(),
                  n_processes,
                  sample=20)
示例13
def main():
    """CLI entry point: pre-tokenize the TriviaQA evidence corpus to output_dir."""
    parser = argparse.ArgumentParser("Pre-tokenize the TriviaQA evidence corpus")
    parser.add_argument("-o", "--output_dir", type=str,
                        default=join("data", "triviaqa", "evidence"))
    parser.add_argument("-s", "--source", type=str,
                        default=join(TRIVIA_QA, "evidence"))
    # Tokenization is slow; running with several processes is recommended.
    parser.add_argument("-n", "--n_processes", type=int, default=1,
                        help="Number of processes to use")
    parser.add_argument("--max_tokens", type=int, default=200,
                        help="Number of maximal tokens in each merged paragraph")
    parser.add_argument("--wiki_only", action="store_true")
    opts = parser.parse_args()

    build_tokenized_corpus(
        opts.source,
        tokenization.BasicTokenizer(do_lower_case=True),
        MergeParagraphs(opts.max_tokens),
        opts.output_dir,
        n_processes=opts.n_processes,
        wiki_only=opts.wiki_only,
    )
示例14
def main():
    """CLI entry point: retrieve the top TF-IDF paragraphs for each SQuAD open
    dev question, report answer recall / document lengths, and pickle the
    pruned DocumentAndQuestion examples.
    """
    def str_to_bool(value):
        # BUG FIX: argparse's type=bool treats ANY non-empty string as True
        # (bool("False") is True), so "--debug False" silently enabled debug
        # mode and "--sort_passage False" could not disable sorting. Parse the
        # text explicitly instead. Passing "True"/"False" keeps working.
        if isinstance(value, bool):
            return value
        lowered = value.lower()
        if lowered in ("true", "t", "yes", "y", "1"):
            return True
        if lowered in ("false", "f", "no", "n", "0"):
            return False
        raise argparse.ArgumentTypeError("Boolean value expected, got %r" % value)

    parse = argparse.ArgumentParser("Pre-tokenize the SQuAD open dev file")
    parse.add_argument("--input_file", type=str, default=join("data", "squad", "squad_dev_open.pkl"))
    # This is slow, using more processes is recommended
    parse.add_argument("--max_tokens", type=int, default=200, help="Number of maximal tokens in each merged paragraph")
    parse.add_argument("--n_to_select", type=int, default=30, help="Number of paragraphs to retrieve")
    parse.add_argument("--sort_passage", type=str_to_bool, default=True, help="Sort passage according to order")
    parse.add_argument("--debug", type=str_to_bool, default=False, help="Whether to run in debug mode")
    args = parse.parse_args()

    # FIX: close the input file instead of leaking the handle from
    # pickle.load(open(...)).
    with open(args.input_file, 'rb') as f:
        dev_examples = pickle.load(f)

    tokenizer = tokenization.BasicTokenizer(do_lower_case=True)
    splitter = MergeParagraphs(args.max_tokens)
    tfidf = TfidfVectorizer(strip_accents="unicode", stop_words=stop_words)
    detector = FastNormalizedAnswerDetector()

    ir_count, total_doc_length, pruned_doc_length = 0, 0, 0
    out = []
    for example_ix, example in tqdm(enumerate(dev_examples), total=len(dev_examples)):
        # Split the raw document into non-empty paragraphs, tokenize each, and
        # merge them into chunks of at most max_tokens tokens.
        paras = [x for x in example.doc_text.split("\n") if len(x) > 0]
        paragraphs = [tokenizer.tokenize(x) for x in paras]
        merged_paragraphs = splitter.merge(paragraphs)

        # Score each merged paragraph against the question and keep the first
        # n_to_select indices of the argsort ordering.
        scores = rank(tfidf, [example.question_text], [" ".join(x) for x in merged_paragraphs])
        para_scores = scores[0]
        para_ranks = np.argsort(para_scores)
        selection = list(para_ranks[:args.n_to_select])

        if args.sort_passage:
            # Restore original document order among the selected paragraphs.
            selection = np.sort(selection)

        doc_tokens = []
        for idx in selection:
            doc_tokens += merged_paragraphs[idx]

        # Count examples whose gold answer still occurs in the pruned document
        # (retrieval recall).
        tokenized_answers = [tokenizer.tokenize(x) for x in example.answer_texts]
        detector.set_question(tokenized_answers)
        if len(detector.any_found(doc_tokens)) > 0:
            ir_count += 1

        total_doc_length += sum(len(para) for para in merged_paragraphs)
        pruned_doc_length += len(doc_tokens)

        out.append(DocumentAndQuestion(example_ix, example.qas_id, example.question_text, doc_tokens,
                                       '', 0, 0, True))
        if args.debug and example_ix > 5:
            break

    # FIX: fail with a clear message instead of ZeroDivisionError when the
    # input file contains no examples.
    if not out:
        raise ValueError("No examples were loaded from %s" % args.input_file)
    print("Recall of answer existence in documents: {:.3f}".format(ir_count / len(out)))
    print("Average length of documents: {:.3f}".format(total_doc_length / len(out)))
    print("Average pruned length of documents: {:.3f}".format(pruned_doc_length / len(out)))
    output_file = join("data", "squad", "eval_open_{}paras_examples.pkl".format(args.n_to_select))
    # FIX: ensure the output file is flushed and closed.
    with open(output_file, 'wb') as f:
        pickle.dump(out, f)