Python source code examples: gensim.models.doc2vec.TaggedDocument()
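TaggedDocument pairs a list of tokens (words) with one or more document tags (tags); Doc2Vec then learns one vector per tag. Before the collected examples below, here is a minimal, self-contained sketch of the typical pattern (the toy corpus and tag names are invented for illustration; model.dv assumes gensim >= 4.0, older releases expose the same vectors as model.docvecs):

from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.utils import simple_preprocess

texts = ["human machine interface for lab computer applications",
         "a survey of user opinion of computer system response time"]
corpus = [TaggedDocument(words=simple_preprocess(text), tags=[str(i)])
          for i, text in enumerate(texts)]

model = Doc2Vec(corpus, vector_size=50, window=5, min_count=1, epochs=20)

trained_vec = model.dv["0"]                      # vector learned for the document tagged "0"
new_vec = model.infer_vector(simple_preprocess("computer system interface"))  # vector for unseen text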
Example 1
def fit(self, texts):
    model_param = {
        "vector_size": self.vector_size,
        "epochs": self.epochs,
        "min_count": self.min_count,
        "workers": self.n_jobs,
        "window": self.window,
        "dm_concat": self.dm_concat,
        "dbow_words": self.dbow_words,
    }
    corpus = [TaggedDocument(simple_preprocess(text), [i])
              for i, text in enumerate(texts)]
    # If self.dm is 2, train both models and concatenate the feature
    # vectors later. Resulting vector size should be the same.
    if self.dm == 2:
        model_param["vector_size"] = int(model_param["vector_size"] / 2)
        self.model_dm = _train_model(corpus, **model_param, dm=1)
        self.model_dbow = _train_model(corpus, **model_param, dm=0)
    else:
        self.model = _train_model(corpus, **model_param, dm=self.dm)
Example 2
def fit(self, graphs):
    """
    Fitting a Graph2Vec model.

    Arg types:
        * **graphs** *(List of NetworkX graphs)* - The graphs to be embedded.
    """
    self._set_seed()
    self._check_graphs(graphs)
    documents = [WeisfeilerLehmanHashing(graph, self.wl_iterations, self.attributed) for graph in graphs]
    documents = [TaggedDocument(words=doc.get_graph_features(), tags=[str(i)]) for i, doc in enumerate(documents)]

    model = Doc2Vec(documents,
                    vector_size=self.dimensions,
                    window=0,
                    min_count=self.min_count,
                    dm=0,
                    sample=self.down_sampling,
                    workers=self.workers,
                    epochs=self.epochs,
                    alpha=self.learning_rate,
                    seed=self.seed)

    self._embedding = [model.docvecs[str(i)] for i, _ in enumerate(documents)]
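Note: Examples 2 and 3 read the trained vectors through model.docvecs. In gensim 4.0 this attribute was renamed to dv (the old name is kept as a deprecated alias), so under that assumption the last line can equivalently be written as:

self._embedding = [model.dv[str(i)] for i, _ in enumerate(documents)]  # gensim >= 4.0 spelling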
Example 3
def fit(self, graphs):
    """
    Fitting a GL2Vec model.

    Arg types:
        * **graphs** *(List of NetworkX graphs)* - The graphs to be embedded.
    """
    self._set_seed()
    self._check_graphs(graphs)
    graphs = [self._create_line_graph(graph) for graph in graphs]
    documents = [WeisfeilerLehmanHashing(graph, self.wl_iterations, False) for graph in graphs]
    documents = [TaggedDocument(words=doc.get_graph_features(), tags=[str(i)]) for i, doc in enumerate(documents)]

    model = Doc2Vec(documents,
                    vector_size=self.dimensions,
                    window=0,
                    min_count=self.min_count,
                    dm=0,
                    sample=self.down_sampling,
                    workers=self.workers,
                    epochs=self.epochs,
                    alpha=self.learning_rate,
                    seed=self.seed)

    self._embedding = [model.docvecs[str(i)] for i, _ in enumerate(documents)]
Example 4
def _create_documents(self, walks, features):
    """
    Accumulating the WL feature in neighbourhoods.

    Arg types:
        * **walks** *(list of lists)* - Random walks with string ids.

    Return types:
        * **new_features** *(list of TaggedDocument objects)* - The pooled features of nodes.
    """
    new_features = {node: [] for node, feature in features.items()}
    walks = self._transform_walks(walks)
    for walk in walks:
        for i in range(self.walk_length - self.window_size):
            for j in range(self.window_size):
                source = walk[i]
                target = walk[i + j]
                new_features[source].append(features[target])
                new_features[target].append(features[source])

    new_features = {node: [feature for features in new_features[node] for feature in features]
                    for node, _ in new_features.items()}
    new_features = [TaggedDocument(words=feature, tags=[str(node)]) for node, feature in new_features.items()]
    return new_features
Example 5
def train(args):
    vocab = load_json(args.vocab)
    # load corpus
    corpus = CorpusIter20News(args.corpus[0], recursive=True, stem=True, with_docname=True)
    # corpus = CorpusIterMRD(args.corpus[0], load_json(args.docnames), stem=True, with_docname=True)
    # corpus = CorpusIterWiki10plus(args.corpus[0], load_json(args.docnames), stem=True, with_docname=True)
    # corpus = CorpusIterReuters(args.corpus, load_json(args.docnames), with_docname=True)
    corpus_iter = lambda: (TaggedDocument([word for word in sentence if word in vocab], tag) for sentence, tag in corpus)

    d2v = MyDoc2Vec(args.n_dim, window=args.window_size,
                    negative=args.negative, epoches=args.n_epoch, dm_concat=1)

    start = timeit.default_timer()
    d2v.train(corpus_iter)
    print('runtime: %ss' % (timeit.default_timer() - start))

    save_doc2vec(d2v.model, args.save_model)
Example 6
def process_non_pooled_model_data(walks, counts, args):
    """
    Function to extract proximity statistics.
    :param walks: Diffusion lists.
    :param counts: Number of nodes.
    :param args: Arguments object.
    :return docs: Processed walks.
    """
    print("Run feature extraction across windows.")
    features = {str(node): [] for node in range(counts)}
    for walk in tqdm(walks):
        for i in range(len(walk) - args.window_size):
            for j in range(1, args.window_size + 1):
                features[walk[i]].append(["+" + str(j) + "_" + walk[i + j]])
                features[walk[i + j]].append(["_" + str(j) + "_" + walk[i]])
    docs = [TaggedDocument(words=[x[0] for x in v], tags=[str(k)]) for k, v in features.items()]
    return docs
Example 7
def read_corpus(path='.', exclude=[], targets=None):
    i = 0
    for file in os.listdir(path):
        if file[-4:] == '.txt' and file not in exclude and 'no_en' not in file:  # ensure file is an English txt file
            print(file)
            with open(os.path.join(path, file), encoding="utf8") as document_text:
                for line in document_text:
                    count = 0
                    words = simple_preprocess(line)
                    for word in words:  # count the number of words with <= 3 characters
                        if len(word) <= 3:
                            count += 1
                    # exclude lines in which half the words have <= 3 characters or that have 10 words or fewer
                    if count < len(words) / 2 and len(words) > 10:
                        yield doc2vec.TaggedDocument(words, [i])
                        i += 1
    if targets:
        for key, val in targets.items():
            yield doc2vec.TaggedDocument(simple_preprocess(val), [i])
            i += 1
Example 8
def tagcol_paragraph_embeddings_features(train_data):
    # Expects a dataframe with a 'values' column
    train_data_values = train_data['values']
    columns = [TaggedDocument(random.sample(col, min(1000, len(col))), [i])
               for i, col in enumerate(train_data_values.values)]
    return columns

# Input: returned tagged document collection from tagcol_paragraph_embeddings_features
# Only needed for training.
Example 9
def _create_documents(self, features):
    features_out = [TaggedDocument(words=[str(feat) for feat_elems in feature_set for feat in feat_elems],
                                   tags=[str(node)])
                    for node, feature_set in features.items()]
    return features_out
Example 10
def _create_base_docs(self):
    features_out = [TaggedDocument(words=[str(feature) for feature in features], tags=[str(node)])
                    for node, features in self.features.items()]
    return features_out
Example 11
def create_documents(features):
    """
    From a feature hash create a list of TaggedDocuments.
    :param features: Feature hash table - keys are nodes, values are feature lists.
    :return docs: Tagged Documents list.
    """
    docs = [TaggedDocument(words=v, tags=[str(k)]) for k, v in features.items()]
    return docs
Example 12
def __iter__(self):
    with open(self.fname, encoding='utf-8') as f:
        for line in f:
            try:
                sentence, movie_id = line.strip().split("\u241E")
                tokens = self.tokenizer.morphs(sentence)
                tagged_doc = TaggedDocument(words=tokens, tags=['MOVIE_%s' % movie_id])
                yield tagged_doc
            except:
                continue
Example 13
def test(args):
    vocab = load_json(args.vocab)
    # load corpus
    corpus = CorpusIter20News(args.corpus[0], recursive=True, stem=True, with_docname=True)
    # corpus = CorpusIterMRD(args.corpus[0], load_json(args.docnames), stem=True, with_docname=True)
    # corpus = CorpusIterWiki10plus(args.corpus[0], load_json(args.docnames), stem=True, with_docname=True)
    # corpus = CorpusIterReuters(args.corpus, load_json(args.docnames), with_docname=True)
    corpus_iter = lambda: (TaggedDocument([word for word in sentence if word in vocab], tag) for sentence, tag in corpus)

    d2v = load_doc2vec(args.load_model)
    doc_codes = predict(d2v, corpus_iter)
    dump_json(doc_codes, args.output)
Example 14
def main(script_folder, model_pickle_filename, training_algorithm, num_cores, epochs, vector_size, window, min_count, alpha, max_script_count, min_script_len, negative):
    doc2vec_tagged_documents = list()
    counter = 0
    logger.info("retrieving files")

    # Retrieve files containing Python scripts
    # Altair's JSON format uses the 'content' label for the script code
    for py_file in sorted(os.listdir(script_folder)):
        if counter >= max_script_count: break
        if counter % 100000 == 0: logger.info("processed %d files" % counter)
        fullpath = os.path.join(script_folder, py_file)
        with open(fullpath, "r") as py_file_contents:
            for line in py_file_contents:
                parsed_json = json.loads(line)
                code, comments = separate_code_and_comments(parsed_json['content'], py_file)
                if len(code) < min_script_len:
                    continue
                else:
                    tokenized_code = normalize_text(code, remove_stop_words=False, only_letters=False, return_list=True, remove_one_char_words=True)
                    doc2vec_tagged_documents.append(doc2vec.TaggedDocument(tokenized_code, [counter]))
                    counter += 1

    doc2vec_model = build_doc2vec_model(doc2vec_tagged_documents, training_algorithm, num_cores, epochs, vector_size, window, min_count, alpha, negative)

    # Per http://radimrehurek.com/gensim/models/doc2vec.html, delete_temporary_training_data reduces model size
    # If keep_doctags_vectors is set to false, most_similar, similarity, sims is no longer available
    # If keep_inference is set to false, infer_vector on a new document is no longer possible
    doc2vec_model.delete_temporary_training_data(keep_doctags_vectors=False, keep_inference=True)

    # Per http://radimrehurek.com/gensim/models/doc2vec.html, doc2vec has its own method for saving/loading models
    # doc2vec_model.save(model_pickle_filename)
    # doc2vec_model = doc2vec.Doc2Vec.load(model_pickle_filename)

    # logger.info("saving doc2vec model in a pickle file at %s" % model_pickle_filename)
    pickle.dump(doc2vec_model, open(model_pickle_filename, "wb"))
    logger.info("doc2vec model pickle file saved at %s" % model_pickle_filename)

# Run this when called from CLI
Example 15
def main(script_folder, output_folder, min_script_len, max_total_files, max_per_pkl):
    doc2vec_tagged_documents = list()
    counter = 0
    logger.info("retrieving files")
    just_started = True

    # Retrieve files containing Python scripts
    # Altair's JSON format uses the 'content' label for the script code
    for py_file in sorted(os.listdir(script_folder)):
        if counter >= max_total_files: break
        fullpath = os.path.join(script_folder, py_file)
        with open(fullpath, "r") as py_file_contents:
            for line in py_file_contents:
                if counter >= max_total_files: break
                if counter != 0 and counter % 50000 == 0: logger.info("processed %d files" % counter)
                if not just_started and counter % max_per_pkl == 0:
                    logger.info("Saving pickle file of tagged documents for size %d", max_per_pkl)
                    pickle.dump(doc2vec_tagged_documents, open(os.path.join(output_folder, "training" + str(counter) + ".pkl"), "wb"))
                    doc2vec_tagged_documents = list()
                    just_started = True
                parsed_json = json.loads(line)
                code, _ = separate_code_and_comments(parsed_json['content'], py_file)
                if len(code) < min_script_len:
                    continue
                else:
                    tokenized_code = normalize_text(code, remove_stop_words=False, only_letters=False, return_list=True, remove_one_char_words=True)
                    if len(tokenized_code) > 1:
                        doc2vec_tagged_documents.append(doc2vec.TaggedDocument(tokenized_code, [counter]))
                        counter += 1
                        just_started = False

    logger.info("Saving final pickle file of tagged documents for size %d", max_per_pkl)
    pickle.dump(doc2vec_tagged_documents, open(os.path.join(output_folder, "training" + str(counter) + ".pkl"), "wb"))

# Run this when called from CLI
Example 16
def extract_instances(self, train_instances):
    sentences = []
    for idx, train_instance in enumerate(train_instances):
        sa, sb = train_instance.get_word(type='lemma', lower=True)
        sentences.append(TaggedDocument(words=sa, tags=['sa_%d' % idx]))
        sentences.append(TaggedDocument(words=sb, tags=['sb_%d' % idx]))

    model = Doc2Vec(sentences, size=25, window=3, min_count=0, workers=10, iter=1000)

    features = []
    infos = []
    for idx in range(len(train_instances)):
        vec_a = model.docvecs['sa_%d' % idx]
        vec_b = model.docvecs['sb_%d' % idx]
        feature, info = vk.get_all_kernel(vec_a, vec_b)
        features.append(feature)
        infos.append([])
        # infos.append([vec_a, vec_b])

    return features, infos

# def load_instances(self, train_instances):
#     """
#     extract cosine distance from already trained feature file
#     without modifying the feature_file
#     this function's priority is higher than the above extract_instances
#     """
#
#     _features, _n_dim, _n_instance = Feature.load_feature_from_file(self.feature_file)
#     features = []
#     infos = []
#     ''' get features from train instances '''
#     for _feature in _features:
#         feature = Feature._feat_string_to_list(_feature, _n_dim)
#         features.append([feature[1]])
#         infos.append(['cosine'])
#
#     features = [Feature._feat_list_to_string(feature) for feature in features]
#
#     return features, 1, _n_instance
Example 17
def fit(self, docs, y):
    assert len(docs) == len(y)
    model = self.model
    n_epochs = self.n_epochs
    verbose = self.verbose
    decay = (self.alpha - self.min_alpha) / n_epochs
    X = [TaggedDocument(self.analyzer(doc), [label])
         for doc, label in zip(docs, y)]
    if verbose > 0:
        print("First 3 tagged documents:\n", X[:3])
        print("Training doc2vec model")
    # d2v = Doc2Vec()
    # d2v.build_vocab(X)
    # if self.intersect is not None:
    #     d2v.intersect_word2vec_format(self.intersect)
    model.build_vocab(X)
    for epoch in range(n_epochs):
        if verbose:
            print("Doc2Vec: Epoch {} of {}.".format(epoch + 1, n_epochs))
        model.train(X)
        model.alpha -= decay  # apply global decay
        model.min_alpha = model.alpha  # but no decay inside one epoch

    if verbose > 0:
        print("Finished.")
        print("model:", self.model)

    if self._matching:
        self._matching.fit(docs)
    else:
        # if we don't do matching, it's enough to fit a nearest neighbors on
        # all centroids before query time
        dvs = np.asarray([model.docvecs[tag] for tag in y])
        self._neighbors.fit(dvs)
    self._y = y
    return self
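Example 17 drives the training loop by hand and decays alpha between epochs. Current gensim releases additionally require total_examples (or total_words) and epochs to be passed to train(); a minimal sketch of the same loop under that assumption, with invented toy data:

from gensim.models.doc2vec import Doc2Vec, TaggedDocument

X = [TaggedDocument(["some", "example", "tokens"], [0]),
     TaggedDocument(["more", "example", "tokens"], [1])]
n_epochs, alpha, min_alpha = 10, 0.025, 0.001

# min_alpha == alpha disables gensim's internal decay within a single pass
model = Doc2Vec(vector_size=50, min_count=1, alpha=alpha, min_alpha=alpha)
model.build_vocab(X)
decay = (alpha - min_alpha) / n_epochs
for epoch in range(n_epochs):
    model.train(X, total_examples=model.corpus_count, epochs=1)  # one pass per call
    model.alpha -= decay               # global decay, as in the example above
    model.min_alpha = model.alpha      # no decay inside one epoch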
Example 18
def test_doc2vec_inference():
    tagged_docs = [TaggedDocument(simple_preprocess(doc), [i])
                   for i, doc in enumerate(documents)]
    model = Doc2Vec(tagged_docs, epochs=1, min_count=1)
    d2v = Doc2VecInference(model, DEFAULT_ANALYZER)
    match_op = Matching()
    retrieval = Retrieval(d2v, matching=match_op).fit(documents)
    result = retrieval.query("scientists")
    assert result[0] == 1
Example 19
def test_doc2vec_inference_saveload():
    tagged_docs = [TaggedDocument(simple_preprocess(doc), [i])
                   for i, doc in enumerate(documents)]
    model = Doc2Vec(tagged_docs, epochs=1, min_count=1, vector_size=10)
    model.save(TEST_FILE)
    del model
    model = Doc2Vec.load(TEST_FILE)
    os.remove(TEST_FILE)
    d2v = Doc2VecInference(model, DEFAULT_ANALYZER)
    match_op = Matching()
    retrieval = Retrieval(d2v, matching=match_op).fit(documents)
    result = retrieval.query("scientists")
    assert result[0] == 1
Example 20
def __iter__(self):
    for source, prefix in self.sources.items():
        with utils.smart_open(source) as fin:
            for item_no, line in enumerate(fin):
                yield TaggedDocument(utils.to_unicode(line).split(),
                                     [prefix + '_%s' % item_no])
Example 21
def to_array(self):
    for source, prefix in self.sources.items():
        with utils.smart_open(source) as fin:
            for item_no, line in enumerate(fin):
                self.sentences.append(
                    TaggedDocument(utils.to_unicode(line).split(),
                                   [prefix + '_%s' % item_no]))
    return self.sentences
Example 22
def feature_extractor(data, rounds, name):
    graph = nx.from_edgelist(np.array(data.edge_index.T.cpu(), dtype=int))
    if data.x is not None:
        feature = {int(key): str(val) for key, val in enumerate(np.array(data.x.cpu()))}
    else:
        feature = dict(nx.degree(graph))
    graph_wl_features = Graph2Vec.wl_iterations(graph, feature, rounds)
    doc = TaggedDocument(words=graph_wl_features, tags=["g_" + name])
    return doc
Example 23
def _set_tagged(self):
    """Set self._tagged to a list of TaggedDocument; each TaggedDocument has a tag of [index]."""
    print("listing tagged documents in memory")
    self._tagged = [TaggedDocument(doc, tags=[index]) for index, doc in enumerate(self._tokenized)]
Example 24
def __iter__(self):
    deck = []
    for line in open(self.filename, encoding="utf-8"):
        deck.append(line)
        if len(deck) >= 10000000:
            shuffle(deck)
            for card in deck:
                csv = card.split(",")
                subreddit = csv[0]
                body = csv[1].split()
                yield TaggedDocument(words=body, tags=[subreddit, clusterLabel[subreddit]])
            deck = []
Example 25
def transform(self, texts):
    corpus = [TaggedDocument(simple_preprocess(text), [i])
              for i, text in enumerate(texts)]
    if self.dm == 2:
        X_dm = _transform_text(self.model_dm, corpus)
        X_dbow = _transform_text(self.model_dbow, corpus)
        X = np.concatenate((X_dm, X_dbow), axis=1)
    else:
        X = _transform_text(self.model, corpus)
    return X
Example 26
def feature_extractor(path, rounds):
    """
    Function to extract WL features from a graph.
    :param path: The path to the graph json.
    :param rounds: Number of WL iterations.
    :return doc: Document collection object.
    """
    graph, features, name = dataset_reader(path)
    machine = WeisfeilerLehmanMachine(graph, features, rounds)
    doc = TaggedDocument(words=machine.extracted_features, tags=["g_" + name])
    return doc
Example 27
def create_documents(features):
    """
    Create a tagged documents object from a dictionary.
    :param features: Keys are document ids and values are strings of the document.
    :return docs: List of tagged documents.
    """
    docs = [TaggedDocument(words=v, tags=[str(k)]) for k, v in features.items()]
    return docs