Python source code examples: thulac.thulac()

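The examples below are excerpts from real projects; each snippet assumes "import thulac" plus the project's own helper functions. As a quick orientation, here is a minimal, self-contained sketch of the constructor options and the cut() call that these examples rely on (the sample sentence and printed results are illustrative):

import thulac

# Load the model once; initialization is slow, so reuse the instance.
seg = thulac.thulac(seg_only=True)            # segmentation only, no POS tags
print(seg.cut("我爱北京天安门", text=True))     # "我 爱 北京 天安门" (space-separated string)
print(seg.cut("我爱北京天安门"))                # [['我', ''], ['爱', ''], ...] word/tag pairs

tagger = thulac.thulac()                      # segmentation + POS tagging, '_' joins word and tag by default
print(tagger.cut("我爱北京天安门", text=True))  # "我_r 爱_v 北京_ns 天安门_ns"
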
Example 1
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--raw_data_path', default='../data/train.json', type=str, required=False, help='path of the raw training corpus')
    parser.add_argument('--vocab_file', default='vocab_processed.txt', type=str, required=False, help='path of the generated vocab file')
    parser.add_argument('--vocab_size', default=50000, type=int, required=False, help='vocabulary size')
    args = parser.parse_args()

    lac = thulac.thulac(seg_only=True)
    tokenizer = Tokenizer(num_words=args.vocab_size)
    print('args:\n' + args.__repr__())
    print('This script is extremely slow especially for large corpus. Take a break.')

    f = open(args.raw_data_path, 'r')
    lines = json.load(f)
    for i, line in enumerate(tqdm(lines)):
        lines[i] = lac.cut(line, text=True)

    tokenizer.fit_on_texts(lines)
    vocab = list(tokenizer.index_word.values())
    pre = ['[SEP]', '[CLS]', '[MASK]', '[PAD]', '[UNK]']
    vocab = pre + vocab
    with open(args.vocab_file, 'w') as f:
        for word in vocab[:args.vocab_size + 5]:
            f.write(word + '\n') 
Example 2
def return_cut_word_list_list(filepath):
	# return the cut summaries and short texts as [[], [], []]; each element is thulac's list of [word, tag] pairs
	list_summary , list_short_text = get_clean_data(filepath)
	thu_cut = thulac.thulac(seg_only=True)  # keyword form of the Python API ("-seg_only" is the command-line style flag)
	list_word_short_text = []
	list_word_summary = []
	for i,short_text in enumerate(list_short_text):
		list_temp = thu_cut.cut(short_text)
		#if i%10000 == 0:
		#	print i," ".join(list_temp)
		list_word_short_text.append(list_temp)
	for i,summary in enumerate(list_summary):
		# list_temp = list(" ".join(thu_cut.cut(summary)))
		list_temp = thu_cut.cut(summary)
		#if i%10000 == 0:
		#	print i," ".join(list_temp)
		list_word_summary.append(list_temp)
	
	#print len(list_word_summary[0]), ' '.join(list_word_summary[0])
	return list_word_summary , list_word_short_text 
Example 3
def preprocess(dataset: str):
    global segment_tool, dictionary
    print('Loading Segment Model...')
    segment_tool = thulac(rm_space=True)
    print('Loading dictionary')
    dictionary = set(map(lambda s: s.rstrip('\n'), open('dataset/dictionary.txt', encoding='utf-8').readlines()))

    dataset_list = (['train', 'test'], [dataset])
    for dataset_type, dataset_name in product(*dataset_list):
        with open('dataset/%s/%s_seg.txt' % (dataset_name, dataset_type), 'w', encoding='utf-8') as f:
            for line in handle_data('dataset/%s/%s.txt' % (dataset_name, dataset_type)):
                f.write(json.dumps(line, ensure_ascii=False) + '\n') 
Example 4
def testSegOnly():
	test_text = "我爱北京天安门"
	thu = thulac.thulac(seg_only = True)
	gold = thu.cut(test_text, text = True)
	assert gold == "我 爱 北京 天安门"

# Since initializing the Tag model is expensive, the two Tag-model tests are grouped together here
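Because every thulac.thulac() call reloads the model, test suites like this one usually build the tagger once and share it across tests. A minimal sketch of that pattern, assuming pytest (the fixture and test names are hypothetical):

import pytest
import thulac

@pytest.fixture(scope="module")
def tagger():
    # Build the expensive tagging model once per test module.
    return thulac.thulac(deli='#')

def test_tag_and_deli(tagger):
    assert tagger.cut("我爱北京天安门", text=True) == "我#r 爱#v 北京#ns 天安门#ns"
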
Example 5
def testTagAndDeli():
	test_text = "我爱北京天安门"
	thu = thulac.thulac(deli = '#')
	gold = thu.cut(test_text, text = True)
	assert gold == "我#r 爱#v 北京#ns 天安门#ns" 
Example 6
def testUserDict():
	test_text = "我爱北京天安门"
	thu = thulac.thulac(seg_only = True, user_dict = prefix + "/userDict.txt")
	gold = thu.cut(test_text, text = True)
	assert gold == "我爱北京天安门" 
Example 7
def testT2S():
	test_text = "我愛北京天安門"
	thu = thulac.thulac(seg_only = True, T2S = True)
	gold = thu.cut(test_text, text = True)
	print(gold)
	assert gold == "我 爱 北京 天安门" 
Example 8
def testFilt():
	test_text = "我可以爱北京天安门"
	thu = thulac.thulac(seg_only = True, filt = True)
	gold = thu.cut(test_text, text = True)
	print(gold)
	assert gold == "我 爱 北京 天安门" 
Example 9
def init_thulac(config):
    global cutter
    cutter = thulac.thulac(model_path=config.get("data","thulac"), seg_only=True, filt=False) 
Example 10
def write_cut_word_to_file():
	# write the cut summaries and short texts to file
	filepath = "./LCSTS/DATA/PART_I.txt"
	list_summary , list_short_text = get_clean_data(filepath)
	thu_cut = thulac.thulac("-seg_only")
	
	f_short_text = open("./LCSTS/DATA/PART_I_cut_short_text.txt","w+")
	f_summary = open("./LCSTS/DATA/PART_I_cut_summary.txt","w+")

	print(len(list_summary), type(list_summary), len(list_short_text), type(list_short_text))
	for i,short_text in enumerate(list_short_text):
		list_temp = thu_cut.cut(short_text)
		try:
			content = " ".join(list_temp)
			#if i%5000 == 0:
			#	print i,content
		except:
			content = "wrong short text"
		f_short_text.write(content+"\n")
	
	f_short_text.close()

	for i,summary in enumerate(list_summary):
		list_temp = thu_cut.cut(summary)
		try:
			content = " ".join(list_temp)
			#if i%5000 == 0:
			#	print i,content
		except:
			content = "wrong summary"
		f_summary.write(content+"\n")
		
	f_summary.close() 
Example 11
def createTable(num):
	start = time.time()
	thu = thulac.thulac()
	file = open('agri_economic.json', encoding='utf-8')
	print("begin!")
	table = set()
	f = json.load(file)
	count = 0
	file_text = ""
	for p in f:
		count += 1
		if int(count/2000) != num:
			continue
		if count % 10 == 0:
			cur = time.time()
			print("now id : " + str(count) + "  table size :" + str(len(table)))
			print("Running Time : " + str(int(cur-start)) + " s......")
		detail = p['detail']
#		if len(detail) > 600:
#			detail = detail[0:600]
		title = p['title']
		table.add(title)
		# word segmentation
		text = thu.cut(detail)
		table = table | createWordSet(text)
				
	for t in table:
		file_text += t+' '
	file_object = open('table'+str(num)+".txt",'w')
	file_object.write(file_text)
	file_object.close()

#createTable(0)
#createTable(1)
#createTable(2)
#createTable(3)
#createTable(4)
#createTable(5) 
Example 12
def createTable(num):
	start = time.time()
	thu = thulac.thulac()
	file = open('agri_economic.json', encoding='utf-8')
	print("begin!")
	f = json.load(file)
	count = 0
	file_text = ""
	for p in f:
		count += 1
		if int(count/100) != num:
			continue
		if count % 10 == 0:
			cur = time.time()
			print("now id : " + str(count) + "  table size :" )
			print("Running Time : " + str(int(cur-start)) + " s......")
		detail = p['detail']
#		if len(detail) > 600:
#			detail = detail[0:600]
		title = p['title']
		# word segmentation
		text = thu.cut(detail)
		wordList = createWordList(text)
		file_text += title
		for word in wordList:
			file_text += ' ' + word
		file_text += '\n'
				
	file_object = open('article'+str(num)+".txt",'w')
	file_object.write(file_text)
	file_object.close() 
Example 13
def cut_text(alltext):
	count = 0	
	cut = thulac.thulac(seg_only = True)
	train_text = []
	for text in alltext:
		count += 1
		if count % 2000 == 0:
			print(count)
		train_text.append(cut.cut(text, text = True))
	
	return train_text 
Example 14
def __init__(self):
		self.tfidf = joblib.load('predictor/model/tfidf.model')
		self.law = joblib.load('predictor/model/law.model')
		self.accu = joblib.load('predictor/model/accu.model')
		self.time = joblib.load('predictor/model/time.model')
		self.batch_size = 1
		
		self.cut = thulac.thulac(seg_only = True) 
Example 15
def get_NE(text):
    # use the preloaded thulac (and neo4j) resources to segment the text
    thu1 = pre_load_thu

    TagList = thu1.cut(text, text=False)
    TagList.append(['===', None])  # append an invalid sentinel at the end to simplify the loop below

    # load the entity labels; this file must sit in the same directory as predict_labels.txt
    label = predict_labels
    
    answerList = []
    i = 0
    length = len(TagList) - 1  # drop the extra sentinel appended above
    while i < length:
        p1 = TagList[i][0]
        t1 = TagList[i][1]
        p2 = TagList[i+1][0]
        t2 = TagList[i+1][1]
        p12 = p1 + TagList[i+1][0]
        
        # the entity must not only appear in the txt labels but also exist in the database
        if p12 in label and preok(t1) and nowok(t2):  # combining the two adjacent words yields an entity
            answerList.append([p12,label[p12]])
            i += 2
            continue
    
        if p1 in label and nowok(t1):     # the current word by itself is an entity
            answerList.append([p1,label[p1]])
            i += 1
            continue
        
        if temporaryok(t1):
            answerList.append([p1,t1])
            i += 1
            continue
        
        answerList.append([p1,0])
        i += 1
    
    return answerList

# punctuation marks that delimit sentences