Python source code examples: annoy.AnnoyIndex()
Example 1
def nn_approx(ds1, ds2, knn=KNN, metric='manhattan', n_trees=10):
    # Assumes module-level: from annoy import AnnoyIndex; import numpy as np;
    # KNN is a module-level default for the number of neighbours.

    # Build index over the second dataset.
    a = AnnoyIndex(ds2.shape[1], metric=metric)
    for i in range(ds2.shape[0]):
        a.add_item(i, ds2[i, :])
    a.build(n_trees)

    # Search index.
    ind = []
    for i in range(ds1.shape[0]):
        ind.append(a.get_nns_by_vector(ds1[i, :], knn, search_k=-1))
    ind = np.array(ind)

    # Match: pair each row of ds1 with its approximate neighbours in ds2.
    match = set()
    for d1_idx, nn_idxs in enumerate(ind):
        for d2_idx in nn_idxs:
            match.add((d1_idx, d2_idx))

    return match
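A natural follow-on is to find mutual nearest neighbours by intersecting matches computed in both directions. A minimal sketch, assuming the nn_approx above; the function name is illustrative, not taken from the original source:

def mnn_approx(ds1, ds2, knn=KNN):
    # Directed matches in both directions.
    match1 = nn_approx(ds1, ds2, knn=knn)  # pairs (i in ds1, j in ds2)
    match2 = nn_approx(ds2, ds1, knn=knn)  # pairs (j in ds2, i in ds1)
    # Keep only pairs that are nearest neighbours of each other.
    return match1 & set((b, a) for (a, b) in match2)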
Example 2
def label_approx(X, sites, site_labels, k=1):
    from annoy import AnnoyIndex
    from collections import Counter
    import numpy as np

    assert(X.shape[1] == sites.shape[1])

    # Build index over site points.
    aindex = AnnoyIndex(sites.shape[1], metric='euclidean')
    for i in range(sites.shape[0]):
        aindex.add_item(i, sites[i, :])
    aindex.build(10)

    labels = []
    for i in range(X.shape[0]):
        # Find the k nearest site points.
        nearest_sites = aindex.get_nns_by_vector(X[i, :], k)
        if len(nearest_sites) < 1:
            labels.append(None)
            continue
        # Assign the most common label among the neighbours.
        label = Counter([
            site_labels[ns] for ns in nearest_sites
        ]).most_common(1)[0][0]
        labels.append(label)
    return np.array(labels)
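A quick usage sketch with toy data (all names and shapes here are illustrative):

import numpy as np
sites = np.random.rand(100, 16)             # labelled reference points
site_labels = np.random.randint(0, 3, 100)  # one label per site
X = np.random.rand(10, 16)                  # query points

print(label_approx(X, sites, site_labels, k=5))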
Example 3
def get_approx_index_chunks(self):
    """Gets decompressed chunks of the AnnoyIndex of the vectors from
    the database."""
    try:
        db = self._db(force_new=True)
        with lz4.frame.LZ4FrameDecompressor() as decompressor:
            chunks = db.execute(
                """
                SELECT rowid,index_file
                FROM `magnitude_approx`
                WHERE trees = ?
                """, (self.approx_trees,))
            for chunk in chunks:
                yield decompressor.decompress(chunk[1])
                # Stop early if the reader was closed mid-iteration.
                if self.closed:
                    return
    except Exception as e:
        # Swallow errors caused by the reader being closed; re-raise the rest.
        if self.closed:
            pass
        else:
            raise e
Example 4
def run(self):
    try:
        index = AnnoyIndex(self.n_dims, metric='angular')
        index.load(self.index_filepath)
        for i in range(self.data_indices[0], self.data_indices[1]):
            neighbour_indexes = index.get_nns_by_item(
                i, self.k, search_k=self.search_k, include_distances=False)
            neighbour_indexes = np.array(neighbour_indexes,
                                         dtype=np.uint32)
            self.results_queue.put(
                IndexNeighbours(row_index=i,
                                neighbour_list=neighbour_indexes))
    except Exception as e:
        self.exception = e
    finally:
        self.results_queue.close()
Example 5
def test_build_sparse_annoy_index(annoy_index_file):
    data = np.random.choice([0, 1], size=(10, 5))
    sparse_data = csr_matrix(data)

    index = build_annoy_index(sparse_data, annoy_index_file)
    assert os.path.exists(annoy_index_file)

    loaded_index = AnnoyIndex(5, metric='angular')
    loaded_index.load(annoy_index_file)

    assert index.f == loaded_index.f == 5
    assert index.get_n_items() == loaded_index.get_n_items() == 10
    assert index.get_nns_by_item(0, 5) == loaded_index.get_nns_by_item(0, 5)

    index.unload()
    loaded_index.unload()
Example 6
def create_tree(data, approx, metric, use_faiss, n_trees):
    '''
    Create a faiss/cKDTree/KDTree/annoy index for nearest neighbour lookup.
    All undescribed input as in ``bbknn.bbknn()``. Returns the resulting index.

    Input
    -----
    data : ``numpy.array``
        PCA coordinates of a batch's cells to index.
    '''
    if approx:
        ckd = AnnoyIndex(data.shape[1], metric=metric)
        for i in np.arange(data.shape[0]):
            ckd.add_item(i, data[i, :])
        ckd.build(n_trees)
    elif metric == 'euclidean':
        if 'faiss' in sys.modules and use_faiss:
            ckd = faiss.IndexFlatL2(data.shape[1])
            ckd.add(data)
        else:
            ckd = cKDTree(data)
    else:
        ckd = KDTree(data, metric=metric)
    return ckd
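Querying the index returned by create_tree depends on which backend was built. A rough dispatch sketch covering the annoy and scipy cKDTree branches (the helper is hypothetical, not part of bbknn):

def query_tree(ckd, query, k=10):
    from annoy import AnnoyIndex
    if isinstance(ckd, AnnoyIndex):
        # Annoy answers one query vector at a time.
        return [ckd.get_nns_by_vector(row, k, include_distances=True)
                for row in query]
    # scipy's cKDTree accepts the whole query matrix at once.
    return ckd.query(query, k=k)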
Example 7
def srs_positive_annoy(X, N, seed=None, replace=False, prenormalized=False):
    from annoy import AnnoyIndex

    n_samples, n_features = X.shape

    if not replace and N > n_samples:
        raise ValueError('Cannot sample {} elements from {} elements '
                         'without replacement'.format(N, n_samples))
    if not replace and N == n_samples:
        return range(N)

    if seed is not None:
        np.random.seed(seed)

    X = X - X.min(0)
    if not prenormalized:
        X = normalize(X).astype('float32')

    srs_idx = set()
    for i in range(N):
        # Rebuild the index over the points not yet selected.
        aindex = AnnoyIndex(X.shape[1], metric='euclidean')
        for j in range(X.shape[0]):
            if j not in srs_idx:
                aindex.add_item(j, X[j, :])
        aindex.build(10)

        # Pick the point closest to a random direction on the unit sphere.
        Phi_i = np.random.normal(size=(n_features))
        Phi_i /= np.linalg.norm(Phi_i)
        nearest_site = aindex.get_nns_by_vector(Phi_i, 1)
        srs_idx.add(nearest_site[0])

    return sorted(srs_idx)
Example 8
def fit(self, Ciu, show_progress=True):
    # Delay loading the annoy library in case it's not installed here.
    import annoy

    # Train the model.
    super(AnnoyAlternatingLeastSquares, self).fit(Ciu, show_progress)

    # Build up an Annoy index with all the item_factors (for calculating
    # similar items).
    if self.approximate_similar_items:
        log.debug("Building annoy similar items index")
        self.similar_items_index = annoy.AnnoyIndex(
            self.item_factors.shape[1], 'angular')
        for i, row in enumerate(self.item_factors):
            self.similar_items_index.add_item(i, row)
        self.similar_items_index.build(self.n_trees)

    # Build up a separate index for the inner product (for recommend
    # methods).
    if self.approximate_recommend:
        log.debug("Building annoy recommendation index")
        self.max_norm, extra = augment_inner_product_matrix(self.item_factors)
        self.recommend_index = annoy.AnnoyIndex(extra.shape[1], 'angular')
        for i, row in enumerate(extra):
            self.recommend_index.add_item(i, row)
        self.recommend_index.build(self.n_trees)
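For context, a rough sketch of how a similar-items index built this way can be queried afterwards (a hedged illustration, not necessarily the library's actual method): annoy's angular distance d equals sqrt(2 * (1 - cos)), so cosine similarity can be recovered as 1 - d**2 / 2.

def similar_items(index, itemid, n=10):
    # Query the angular index and convert annoy's angular distances
    # back to cosine-similarity scores.
    ids, dists = index.get_nns_by_item(itemid, n, include_distances=True)
    return [(i, 1 - (d ** 2) / 2) for i, d in zip(ids, dists)]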
Example 9
def generateAnnoy(real, artificial, annoyFilename, dimensions):
    idx2vec = np.array(artificial[2])
    # Recent annoy versions require an explicit metric; older releases
    # defaulted to 'angular' when it was omitted.
    t = AnnoyIndex(dimensions, 'angular')
    for j in range(len(artificial[2])):
        t.add_item(j, idx2vec[j])
    print('Done Adding items to AnnoyIndex')

    t.build(TREESIZE)
    print('Done Building AnnoyIndex')

    t.save(annoyFilename)
    return t
Example 10
def create_annoy(target_features):
    # Explicit metric; older annoy defaulted to 'angular'.
    t = AnnoyIndex(layer_dimension, 'angular')
    for idx, target_feature in enumerate(target_features):
        t.add_item(idx, target_feature)
    t.build(10)
    t.save(os.path.join(work_dir, 'annoy.ann'))
Example 11
def test_tree(self):
    t = AnnoyIndex(5, 'angular')
    t.add_item(1, [1, 2, 3, 4, 5])
    self.assertTrue(t.build(1))
Example 12
def __build_index(self, index_file):
    self.embedding_size = self.embeddings.shape[1]
    self.index = an.AnnoyIndex(self.embedding_size, metric='angular')
    for embedding_ind in range(self.embeddings.shape[0]):
        embedding = self.embeddings[embedding_ind, :]
        self.index.add_item(embedding_ind, embedding)
    self.index.build(self.n_trees)

    if self.id_map is None:
        self.id_map = dict([(i, i) for i in range(self.embeddings.shape[0])])
    self.inverse_id_map = dict([(v, k) for k, v in self.id_map.items()])

    if index_file:
        embeddings_file = index_file + '.embeddings'
        state = {
            'embedding_size': self.embedding_size,
            'id_map': self.id_map,
        }
        self.index.save(embeddings_file)
        with open(index_file, 'wb') as _index_file:
            pickle.dump(state, _index_file)
Example 13
def __load_index(self, index_file):
    log.info('Loading index file from {}'.format(index_file))
    with open(index_file, 'rb') as _index_file:
        state = pickle.load(_index_file)
    self.embedding_size = state['embedding_size']
    self.id_map = state['id_map']

    embeddings_file = index_file + '.embeddings'
    self.index = an.AnnoyIndex(self.embedding_size, metric='angular')
    self.index.load(embeddings_file)
    self.inverse_id_map = dict([(v, k) for k, v in self.id_map.items()])
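The two methods above pair naturally with a lookup that goes through the id maps. A hypothetical companion method, assuming the attributes set up in Examples 12 and 13:

def nearest_ids(self, external_id, k=10):
    # Translate the external id to an index item via inverse_id_map,
    # query the index, then map the neighbours back through id_map.
    item = self.inverse_id_map[external_id]
    neighbours = self.index.get_nns_by_item(item, k)
    return [self.id_map[n] for n in neighbours]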
Example 14
def __init__(self, vecs):
    assert len(vecs) > 0, 'no vecs available to init AnnoyIndex'
    size = len(vecs[0])
    # Explicit metric; older annoy defaulted to 'angular'.
    self.annoy_model = AnnoyIndex(size, 'angular')
    for idx, vec in enumerate(vecs):
        self.annoy_model.add_item(idx, vec)
    self.annoy_model.build(50)
Example 15
def __init__(self, file_name, dim_vector=500):
    self.u = AnnoyIndex(dim_vector, 'angular')
    self.u.load(file_name)
Example 16
def build_annoy_index(encoded, outfile):
    input_shape = encoded.shape
    f = input_shape[1]
    t = AnnoyIndex(f, metric='angular')  # Length of item vector that will be indexed
    for i, v in enumerate(encoded):
        t.add_item(i, v)
    t.build(100)  # 100 trees
    if outfile is not None:
        t.save(outfile)
    return t
Example 17
def load_annoy_index(infile, z_dim):
    t = AnnoyIndex(z_dim, 'angular')
    t.load(infile)  # super fast, will just mmap the file
    return t
Example 18
def test():
    from annoy import AnnoyIndex
    import random

    f = 40
    t = AnnoyIndex(f, 'angular')  # Length of item vector that will be indexed
    for i in range(1000):
        v = [random.gauss(0, 1) for z in range(f)]
        t.add_item(i, v)
    t.build(10)  # 10 trees
    print("test-py-module passed...")
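The canonical round trip adds a save, a load, and a query on top of this test. A self-contained sketch in the style of annoy's README (the file name is illustrative):

from annoy import AnnoyIndex
import random

f = 40
t = AnnoyIndex(f, 'angular')
for i in range(1000):
    t.add_item(i, [random.gauss(0, 1) for _ in range(f)])
t.build(10)
t.save('test.ann')  # persist the forest to disk

u = AnnoyIndex(f, 'angular')
u.load('test.ann')  # mmaps the file, so loading is fast
print(u.get_nns_by_item(0, 10))  # the 10 nearest neighbours of item 0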
Example 19
def build_annoy_tree(facial_embeddings, tree_path,
                     annoy_metric='euclidean', annoy_trees_no=256):
    """
    Builds an annoy tree

    Args:
        facial_embeddings: List of facial embeddings to be indexed in tree
        tree_path: where the annoy tree will be saved
        annoy_metric: euclidean / angular
        annoy_trees_no: how many trees in the annoy forest? Larger = more accurate
    """
    # Annoy tree; facial embeddings are 128-dimensional.
    tree = AnnoyIndex(128, metric=annoy_metric)

    # Don't want to store the entire db in memory
    for idx, f in enumerate(tqdm(facial_embeddings)):
        # SQLite errors sometimes?
        try:
            cur_np = string_to_np(f.latent_space)
            tree.add_item(idx, cur_np)
        except Exception as e:
            tqdm.write(str(e))

    tree.build(annoy_trees_no)
    tree.save(tree_path)
Example 20
def get_approx_index_chunks(self):
    """Gets decompressed chunks of the AnnoyIndex of the vectors from
    the database."""
    try:
        db = self._db(force_new=True, downloader=True)
        num_chunks = db.execute(
            """
            SELECT COUNT(rowid)
            FROM `magnitude_approx`
            WHERE trees = ?
            """, (self.approx_trees,)).fetchall()[0][0]
        with lz4.frame.LZ4FrameDecompressor() as decompressor:
            chunks = db.execute(
                """
                SELECT rowid,index_file
                FROM `magnitude_approx`
                WHERE trees = ?
                """, (self.approx_trees,))
            for chunk in chunks:
                yield num_chunks, decompressor.decompress(chunk[1])
                # Stop early if the reader was closed mid-iteration.
                if self.closed:
                    return
    except Exception as e:
        # Swallow errors caused by the reader being closed; re-raise the rest.
        if self.closed:
            pass
        else:
            raise e
Example 21
def get_approx_index(self):
    """Gets an AnnoyIndex of the vectors from the database."""
    chunks = self.get_approx_index_chunks()
    if self._approx_index is None:
        while True:
            if not self.setup_for_mmap:
                self._setup_for_mmap()
            try:
                approx_index = AnnoyIndex(self.emb_dim, metric='angular')
                approx_index.load(self.path_to_approx_mmap)
                self._approx_index = approx_index
                break
            except BaseException:
                # The mmap file is not ready yet; try to build it from the
                # database chunks under thread and process locks.
                path_to_approx_mmap_temp = self.path_to_approx_mmap + '.tmp'
                tlock = self.APPROX_MMAP_THREAD_LOCK.acquire(False)
                plock = self.APPROX_MMAP_PROCESS_LOCK.acquire(0)
                if tlock and plock:
                    try:
                        with open(path_to_approx_mmap_temp, "w+b") \
                                as mmap_file:
                            for chunk in chunks:
                                mmap_file.write(chunk)
                        if not self.closed:
                            os.rename(path_to_approx_mmap_temp,
                                      self.path_to_approx_mmap)
                        else:
                            return
                    finally:
                        self.APPROX_MMAP_THREAD_LOCK.release()
                        try:
                            self.APPROX_MMAP_PROCESS_LOCK.release()
                        except BaseException:
                            pass
                sleep(1)  # Block before trying again
    return self._approx_index
Example 22
def generator_from_index(X, Y, index_path, k, batch_size, search_k=-1,
                         precompute=True, verbose=1):
    if k >= X.shape[0] - 1:
        raise Exception('''k value greater than or equal to (num_rows - 1)
                        (k={}, rows={}). Lower k to a smaller
                        value.'''.format(k, X.shape[0]))
    if batch_size > X.shape[0]:
        raise Exception('''batch_size value larger than num_rows in dataset
                        (batch_size={}, rows={}). Lower batch_size to a
                        smaller value.'''.format(batch_size, X.shape[0]))

    if Y is None:
        if precompute:
            if verbose > 0:
                print('Extracting KNN from index')
            neighbour_matrix = extract_knn(X, index_path, k=k,
                                           search_k=search_k, verbose=verbose)
            return create_knn_triplet_dataset(X, neighbour_matrix,
                                              batch_size=batch_size)
        else:
            index = AnnoyIndex(X.shape[1], metric='angular')
            index.load(index_path)
            return create_annoy_triplet_dataset(X, index, k=k,
                                                batch_size=batch_size,
                                                search_k=search_k)
    else:
        if precompute:
            if verbose > 0:
                print('Extracting KNN from index')
            neighbour_matrix = extract_knn(X, index_path, k=k,
                                           search_k=search_k, verbose=verbose)
            return create_labeled_knn_triplet_dataset(X, Y, neighbour_matrix,
                                                      batch_size=batch_size)
        else:
            index = AnnoyIndex(X.shape[1], metric='angular')
            index.load(index_path)
            return create_labeled_annoy_triplet_dataset(X, Y, index,
                                                        k=k,
                                                        batch_size=batch_size,
                                                        search_k=search_k)
Example 23
def test_dense_annoy_index(annoy_index_file):
    data = np.random.choice([0, 1], size=(10, 5))
    index = build_annoy_index(data, annoy_index_file)
    assert os.path.exists(annoy_index_file)

    loaded_index = AnnoyIndex(5, metric='angular')
    loaded_index.load(annoy_index_file)

    assert index.f == loaded_index.f == 5
    assert index.get_n_items() == loaded_index.get_n_items() == 10
    assert index.get_nns_by_item(0, 5) == loaded_index.get_nns_by_item(0, 5)

    index.unload()
    loaded_index.unload()
Example 24
def __init__(self, index_file):
    logging.info('Initialising matching utility...')
    # Explicit metric; older annoy defaulted to 'angular'.
    self.index = AnnoyIndex(VECTOR_LENGTH, 'angular')
    self.index.load(index_file, prefault=True)
    logging.info('Annoy index {} is loaded'.format(index_file))

    with open(index_file + '.mapping', 'rb') as handle:
        self.mapping = pickle.load(handle)
    logging.info('Mapping file {} is loaded'.format(index_file + '.mapping'))
    logging.info('Matching utility initialised.')
Example 25
def make_text_graph(user_lemma_matrix, dimensionality, metric,
                    number_of_estimators, number_of_neighbors):
    user_lemma_matrix_tfidf = augmented_tf_idf(user_lemma_matrix)
    if (user_lemma_matrix_tfidf.shape[0] <= dimensionality) or \
            (user_lemma_matrix_tfidf.shape[1] <= dimensionality):
        X_svd = user_lemma_matrix_tfidf.toarray()
    else:
        X_svd = TruncatedSVD(n_components=dimensionality).fit_transform(user_lemma_matrix_tfidf)

    # Index the reduced representations.
    annoy_index = AnnoyIndex(X_svd.shape[1], metric=metric)
    for q in range(X_svd.shape[0]):
        annoy_index.add_item(q, X_svd[q, :])
    annoy_index.build(number_of_estimators)

    # Assemble a sparse k-nearest-neighbour distance graph.
    row = list()
    col = list()
    data = list()
    for q in range(X_svd.shape[0]):
        neighbors, distances = annoy_index.get_nns_by_item(q, number_of_neighbors,
                                                           include_distances=True)
        row.extend([q] * number_of_neighbors)
        col.extend(neighbors)
        data.extend(distances)

    row = np.array(row, dtype=np.int64)
    col = np.array(col, dtype=np.int64)
    data = np.array(data, dtype=np.float64)
    text_graph = spsp.coo_matrix((data, (row, col)),
                                 shape=(X_svd.shape[0], X_svd.shape[0]))
    text_graph = spsp.csr_matrix(text_graph)
    return text_graph
Example 26
def index_embeddings(args):
    """Main run function for indexing the embeddings."""
    unique_strings_path = args.infile + '.embedded.pkl_unique_strings.csv'
    # Load the unique lines.
    with open(unique_strings_path) as f:
        unique_strings = [line.rstrip() for line in f]

    unique_embeddings_path = (args.infile +
                              '.embedded.pkl_unique_strings_embeddings.txt')
    # Load the unique embeddings.
    with open(unique_embeddings_path) as f:
        unique_embeddings = [[float(x) for x in
                              line.strip().split()] for line in f]

    tf.logging.info('Loaded {} unique strings, {} embeddings of dimension {}'.
                    format(len(unique_strings),
                           len(unique_embeddings),
                           len(unique_embeddings[0])))

    # Length of item vector that will be indexed.
    nn_forest = AnnoyIndex(512, metric='angular')
    for i in range(len(unique_strings)):
        v = unique_embeddings[i]
        nn_forest.add_item(i, v)

    # Build an approximate nearest neighbour forest with num_trees trees.
    nn_forest.build(int(args.num_trees))
    output_path = args.infile + '.ann'
    nn_forest.save(output_path)

    tf.logging.info('Index forest built {}'.format(output_path))
    return True
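A corresponding lookup sketch, assuming the files written by index_embeddings above (the helper name is illustrative):

def find_similar_strings(infile, query_index, k=10):
    # Load the saved forest and map neighbour ids back to the strings.
    from annoy import AnnoyIndex

    with open(infile + '.embedded.pkl_unique_strings.csv') as f:
        unique_strings = [line.rstrip() for line in f]

    nn_forest = AnnoyIndex(512, metric='angular')
    nn_forest.load(infile + '.ann')  # mmaps the file

    neighbour_ids = nn_forest.get_nns_by_item(query_index, k)
    return [unique_strings[i] for i in neighbour_ids]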