Python source code examples: stanfordcorenlp.StanfordCoreNLP()
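Before the project examples, here is a minimal, self-contained sketch of the wrapper's basic lifecycle. The distribution path is an assumption; point it at your own unpacked CoreNLP directory.

from stanfordcorenlp import StanfordCoreNLP

# Assumed path to a local CoreNLP distribution; adjust to your setup.
nlp = StanfordCoreNLP(r'./stanford-corenlp-full-2018-10-05')
print(nlp.word_tokenize('Stanford CoreNLP is a Java NLP toolkit.'))
print(nlp.dependency_parse('Stanford CoreNLP is a Java NLP toolkit.'))
nlp.close()  # shut down the background Java server and free the port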
Example 1
def load_simpq(data_dir):
    LogInfo.logs('SimpQ initializing ... ')
    qa_list = []
    corenlp = StanfordCoreNLP(CORENLP_PATH)
    for Tvt in ('train', 'valid', 'test'):
        fp = '%s/annotated_fb_data_%s.txt' % (data_dir, Tvt)
        with codecs.open(fp, 'r', 'utf-8') as br:
            for line in br.readlines():
                qa = {}
                s, p, o, q = line.strip().split('\t')
                s = _remove_simpq_header(s)
                p = _remove_simpq_header(p)
                o = _remove_simpq_header(o)
                qa['utterance'] = q
                qa['targetValue'] = (s, p, o)  # different from other datasets
                qa['tokens'] = corenlp.word_tokenize(qa['utterance'])
                qa['parse'] = corenlp.dependency_parse(qa['utterance'])
                qa_list.append(qa)
                if len(qa_list) % 1000 == 0:
                    LogInfo.logs('%d scanned.', len(qa_list))
    pickle_fp = '%s/simpQ.data.pkl' % data_dir
    with open(pickle_fp, 'wb') as bw:
        pickle.dump(qa_list, bw)
    LogInfo.logs('%d SimpleQuestions loaded.' % len(qa_list))
    return qa_list
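Note that LogInfo, CORENLP_PATH and _remove_simpq_header come from the surrounding project, not from stanfordcorenlp. A hypothetical call, assuming the SimpleQuestions files annotated_fb_data_{train,valid,test}.txt live under the given directory:

qa_list = load_simpq('./SimpleQuestions_v2')  # path is an assumption
print(qa_list[0]['tokens'], qa_list[0]['parse'])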
Example 2
def load_reddit(data_dir, mode='train'):
    LogInfo.logs('Reddit initializing ... ')
    dg_list = []
    corenlp = StanfordCoreNLP(CORENLP_PATH)
    fp = '%s/%s_v3.txt' % (data_dir, mode)
    with open(fp, 'r') as br:
        for line in br:
            dg_line = json.loads(line)
            dialog = {'utterance': dg_line['post'].strip(),
                      'tokens': dg_line['post'].split(),
                      'parse': corenlp.dependency_parse(dg_line['post']),
                      'response': dg_line['response'].strip(),
                      'corr_responses': dg_line['corr_responses'],
                      'all_triples': dg_line['all_triples'],
                      'all_entities': dg_line['all_entities']}
            dg_list.append(dialog)
            if len(dg_list) % 10000 == 0:
                LogInfo.logs('%d scanned.', len(dg_list))
    pickle_fp = '%s/Reddit.%s.pkl' % (data_dir, mode)
    with open(pickle_fp, 'wb') as bw:
        pickle.dump(dg_list, bw)
    LogInfo.logs('%d Reddit saved in [%s].' % (len(dg_list), pickle_fp))
    return dg_list
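Since the function pickles its result, a later run can skip re-parsing by loading the cached file directly. A minimal sketch, with the file name following the pattern above and the directory an assumption:

import pickle

with open('./reddit_data/Reddit.train.pkl', 'rb') as f:  # path is an assumption
    dg_list = pickle.load(f)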
Example 3
def load_stanford_core_nlp(path):
    """
    Load Stanford CoreNLP toolkit objects.
    Args:
        path: String, path to the Stanford CoreNLP directory.
    Returns:
        Chinese and English Stanford CoreNLP objects.
    """
    from stanfordcorenlp import StanfordCoreNLP
    zh_nlp = StanfordCoreNLP(path, lang='zh')
    en_nlp = StanfordCoreNLP(path, lang='en')
    return zh_nlp, en_nlp
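A hedged usage sketch: the wrapper selects the model by lang, and Chinese requests additionally require the Chinese models jar to sit in the same distribution directory (the path and sentences below are assumptions):

zh_nlp, en_nlp = load_stanford_core_nlp('./stanford-corenlp-full-2018-10-05')
print(zh_nlp.word_tokenize('斯坦福大学位于加州。'))
print(en_nlp.pos_tag('Stanford University is in California.'))
zh_nlp.close()
en_nlp.close()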
Example 4
def test_args(self):
    self.assertRaises(IOError, StanfordCoreNLP, '/abc')
    self.assertRaises(ValueError, StanfordCoreNLP,
                      r'G:/JavaLibraries/stanford-corenlp-full-2016-10-31/', lang='abc')
    self.assertRaises(ValueError, StanfordCoreNLP,
                      r'G:/JavaLibraries/stanford-corenlp-full-2016-10-31/', memory='4m')
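The three assertions pin down the constructor's contract: a non-existent path raises IOError, an unsupported lang raises ValueError, and a memory string the wrapper does not accept ('4m' here) raises ValueError. A call these checks would let through, using the same library path as the test:

nlp = StanfordCoreNLP(r'G:/JavaLibraries/stanford-corenlp-full-2016-10-31/',
                      lang='en', memory='4g')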
Example 5
def __init__(self, url_or_path, port=9000):
    """Initialize the Stanford CoreNLP tokenizer.

    Args:
        url_or_path: URL string or path string of the Stanford CoreNLP library.
            Provide a URL string if you have already started a Stanford CoreNLP
            server. If not, provide the path to the library directory, e.g.
            JavaLibraries/stanford-corenlp-full-2017-06-09/. When you provide a
            library path, the Stanford CoreNLP server is started independently
            of the Python process.
    """
    self.tokenizer = StanfordCoreNLP(url_or_path, port=port)
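Both constructor forms map directly onto the wrapper. A minimal sketch of the URL form, assuming a CoreNLP server is already listening on localhost:9000:

from stanfordcorenlp import StanfordCoreNLP

nlp = StanfordCoreNLP('http://localhost', port=9000)  # attach to a running server
print(nlp.word_tokenize('Hello, world.'))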
Example 6
def __init__(self, params):
    """
    A simple NLP helper class.
    Args:
        params (dict): A dict containing some parameters.
    """
    self.params = params
    self.corenlp = StanfordCoreNLP(self.params['corenlp_path'], quiet=False)
    # Pre-fetching the required models.
    props = {'annotators': 'coref', 'pipelineLanguage': 'en', 'ner.useSUTime': False}
    self.corenlp.annotate('', properties=props)
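annotate() returns the server's response as a raw JSON string, so callers typically parse it themselves. A hypothetical follow-up; the class name NLPHelper and the path are assumptions, not from the source:

import json

helper = NLPHelper({'corenlp_path': './stanford-corenlp-full-2018-10-05'})  # hypothetical wrapper class
raw = helper.corenlp.annotate('Tom said he was tired.',
                              properties={'annotators': 'coref', 'pipelineLanguage': 'en'})
doc = json.loads(raw)
print(doc['corefs'])  # coreference chains in the CoreNLP JSON output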