Python source code examples: annoy.AnnoyIndex()

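The examples below are collected from open-source projects and show typical uses of annoy.AnnoyIndex(). Most snippets are excerpted from larger modules, so surrounding imports (for example from annoy import AnnoyIndex, import numpy as np) and module-level constants such as KNN, TREESIZE or VECTOR_LENGTH are assumed to be defined elsewhere in the source project. All of them follow the same basic pattern: construct an index with the vector dimensionality, add items, build a forest of trees, then either query the index directly or save it and mmap-load it later. Several older snippets call AnnoyIndex(f) with no metric argument; early Annoy versions defaulted to 'angular', while newer versions require an explicit metric. A minimal sketch of the common pattern (the file name and sizes are illustrative, not taken from any project below):

import random

from annoy import AnnoyIndex

f = 3  # dimensionality of the vectors to be indexed
index = AnnoyIndex(f, 'angular')
for i in range(100):
    index.add_item(i, [random.gauss(0, 1) for _ in range(f)])
index.build(10)          # 10 trees; more trees = better accuracy, larger index
index.save('demo.ann')

loaded = AnnoyIndex(f, 'angular')
loaded.load('demo.ann')  # fast: the file is simply mmap'd
print(loaded.get_nns_by_vector([0.0] * f, 5))  # indices of the 5 nearest items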
Example 1
def nn_approx(ds1, ds2, knn=KNN, metric='manhattan', n_trees=10):
    # KNN is a module-level default defined in the source project.
    # Build an index over the rows of ds2.
    a = AnnoyIndex(ds2.shape[1], metric=metric)
    for i in range(ds2.shape[0]):
        a.add_item(i, ds2[i, :])
    a.build(n_trees)

    # Query the index with each row of ds1.
    ind = []
    for i in range(ds1.shape[0]):
        ind.append(a.get_nns_by_vector(ds1[i, :], knn, search_k=-1))
    ind = np.array(ind)

    # Pair each ds1 row index with the indices of its ds2 neighbours.
    match = set()
    for d1_i, neighbours in enumerate(ind):
        for d2_i in neighbours:
            match.add((d1_i, d2_i))

    return match

(In the source project, this helper feeds a mutual-nearest-neighbors computation.)
Example 2
def label_approx(X, sites, site_labels, k=1):
    from annoy import AnnoyIndex

    assert X.shape[1] == sites.shape[1]

    # Build index over site points.
    aindex = AnnoyIndex(sites.shape[1], metric='euclidean')
    for i in range(sites.shape[0]):
        aindex.add_item(i, sites[i, :])
    aindex.build(10)

    labels = []
    for i in range(X.shape[0]):
        # Find nearest site point.
        nearest_sites = aindex.get_nns_by_vector(X[i, :], k)
        if len(nearest_sites) < 1:
            labels.append(None)
            continue
        label = Counter([
            site_labels[ns] for ns in nearest_sites
        ]).most_common(1)[0][0]
        labels.append(label)

    return np.array(labels) 
Example 3
def get_approx_index_chunks(self):
        """Gets decompressed chunks of the AnnoyIndex of the vectors from
        the database."""
        try:
            db = self._db(force_new=True)
            with lz4.frame.LZ4FrameDecompressor() as decompressor:
                chunks = db.execute(
                    """
                        SELECT rowid,index_file
                        FROM `magnitude_approx`
                        WHERE trees = ?
                    """, (self.approx_trees,))
                for chunk in chunks:
                    yield decompressor.decompress(chunk[1])
                    if self.closed:
                        return
        except Exception as e:
            if self.closed:
                pass
            else:
                raise e 
Example 4
def run(self):
        try:
            index = AnnoyIndex(self.n_dims, metric='angular')
            index.load(self.index_filepath)
            for i in range(self.data_indices[0], self.data_indices[1]):
                neighbour_indexes = index.get_nns_by_item(
                    i, self.k, search_k=self.search_k, include_distances=False)
                neighbour_indexes = np.array(neighbour_indexes,
                                             dtype=np.uint32)
                self.results_queue.put(
                    IndexNeighbours(row_index=i,
                                    neighbour_list=neighbour_indexes))
        except Exception as e:
            self.exception = e
        finally:
            self.results_queue.close() 
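A worker pattern: each worker mmap-loads the shared on-disk index, computes neighbour lists for its assigned slice of rows, and pushes them through a results queue; any exception is stored on the worker so the parent can inspect it after the worker exits.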
Example 5
def test_build_sparse_annoy_index(annoy_index_file):
    data = np.random.choice([0, 1], size=(10, 5))
    sparse_data = csr_matrix(data)

    index = build_annoy_index(sparse_data, annoy_index_file)
    assert os.path.exists(annoy_index_file)

    loaded_index = AnnoyIndex(5, metric='angular')
    loaded_index.load(annoy_index_file)

    assert index.f == loaded_index.f == 5
    assert index.get_n_items() == loaded_index.get_n_items() == 10
    assert index.get_nns_by_item(0, 5) == loaded_index.get_nns_by_item(0, 5)

    index.unload()
    loaded_index.unload() 
Example 6
def create_tree(data,approx,metric,use_faiss,n_trees):
	'''
	Create a faiss/cKDTree/KDTree/annoy index for nearest neighbour lookup. All undescribed input
	as in ``bbknn.bbknn()``. Returns the resulting index.

	Input
	-----
	data : ``numpy.array``
		PCA coordinates of a batch's cells to index.
	'''
	if approx:
		ckd = AnnoyIndex(data.shape[1],metric=metric)
		for i in np.arange(data.shape[0]):
			ckd.add_item(i,data[i,:])
		ckd.build(n_trees)
	elif metric == 'euclidean':
		if 'faiss' in sys.modules and use_faiss:
			ckd = faiss.IndexFlatL2(data.shape[1])
			ckd.add(data)
		else:
			ckd = cKDTree(data)
	else:
		ckd = KDTree(data,metric=metric)
	return ckd 
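Note the fallback chain: approx=True selects Annoy for any metric; otherwise Euclidean data goes to faiss when it is installed and use_faiss is set, or to scipy's cKDTree, while any other metric falls back to sklearn's KDTree.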
Example 7
def srs_positive_annoy(X, N, seed=None, replace=False, prenormalized=False):
    from annoy import AnnoyIndex

    n_samples, n_features = X.shape

    if not replace and N > n_samples:
        raise ValueError('Cannot sample {} elements from {} elements '
                         'without replacement'.format(N, n_samples))
    if not replace and N == n_samples:
        return range(N)

    if seed is not None:
        np.random.seed(seed)

    X = X - X.min(0)

    if not prenormalized:
        X = normalize(X).astype('float32')

    srs_idx = set()
    for _ in range(N):
        # Rebuild the index each iteration so already-sampled points are excluded.
        aindex = AnnoyIndex(X.shape[1], metric='euclidean')
        for i in range(X.shape[0]):
            if i not in srs_idx:
                aindex.add_item(i, X[i, :])
        aindex.build(10)

        Phi_i = np.random.normal(size=(n_features))
        Phi_i /= np.linalg.norm(Phi_i)

        nearest_site = aindex.get_nns_by_vector(Phi_i, 1)
        srs_idx.add(nearest_site[0])

    return sorted(srs_idx) 
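Rebuilding the index on every one of the N iterations keeps already-sampled points out of the search, at the cost of re-indexing the remaining data each time; each iteration samples a random direction Phi_i on the unit sphere and greedily takes its nearest remaining point.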
Example 8
def fit(self, Ciu, show_progress=True):
        # delay loading the annoy library in case it's not installed here
        import annoy

        # train the model
        super(AnnoyAlternatingLeastSquares, self).fit(Ciu, show_progress)

        # build up an Annoy Index with all the item_factors (for calculating
        # similar items)
        if self.approximate_similar_items:
            log.debug("Building annoy similar items index")

            self.similar_items_index = annoy.AnnoyIndex(
                self.item_factors.shape[1], 'angular')
            for i, row in enumerate(self.item_factors):
                self.similar_items_index.add_item(i, row)
            self.similar_items_index.build(self.n_trees)

        # build up a separate index for the inner product (for recommend
        # methods)
        if self.approximate_recommend:
            log.debug("Building annoy recommendation index")
            self.max_norm, extra = augment_inner_product_matrix(self.item_factors)
            self.recommend_index = annoy.AnnoyIndex(extra.shape[1], 'angular')
            for i, row in enumerate(extra):
                self.recommend_index.add_item(i, row)
            self.recommend_index.build(self.n_trees) 
Example 9
def generateAnnoy(real, artificial, annoyFilename, dimensions):
    idx2vec = np.array(artificial[2])
    t = AnnoyIndex(dimensions)
    for j in range(len(artificial[2])):
        t.add_item(j,idx2vec[j])
    print('Done Adding items to AnnoyIndex')
    t.build(TREESIZE)
    print('Done Building AnnoyIndex')
    t.save(annoyFilename)
    return t 
Example 10
def create_annoy(target_features):
    t = AnnoyIndex(layer_dimension)
    for idx, target_feature in enumerate(target_features):
        t.add_item(idx, target_feature)
    t.build(10)
    t.save(os.path.join(work_dir, 'annoy.ann')) 
Example 11
def test_tree(self):
        t = AnnoyIndex(5, 'angular')
        t.add_item(1, [1,2,3,4,5])

        self.assertTrue(t.build(1)) 
Example 12
def __build_index(self, index_file):
    self.embedding_size = self.embeddings.shape[1]

    self.index = an.AnnoyIndex(self.embedding_size, metric='angular')

    for embedding_ind in range(self.embeddings.shape[0]):
      embedding = self.embeddings[embedding_ind, :]
      self.index.add_item(embedding_ind, embedding)

    self.index.build(self.n_trees)

    if self.id_map is None:
      self.id_map = dict([(i, i) for i in range(self.embeddings.shape[0])])

    self.inverse_id_map = dict([(v,k) for k,v in self.id_map.items()])

    if index_file:
      embeddings_file = index_file + '.embeddings'
      state = {
        'embedding_size': self.embedding_size,
        'id_map': self.id_map,
      }

      self.index.save(embeddings_file)
      with open(index_file, 'wb') as _index_file:
        pickle.dump(state, _index_file) 
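Note the two-file layout: the Annoy index itself is written with its own save() method (Annoy indexes are not picklable), while the small metadata dict holding the dimensionality and ID map is pickled separately; __load_index below reads both back.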
Example 13
def __load_index(self, index_file):
    log.info('Loading index file from {}'.format(index_file))
    with open(index_file, 'rb') as _index_file:
      state = pickle.load(_index_file)
    self.embedding_size = state['embedding_size']
    self.id_map = state['id_map']
    embeddings_file = index_file + '.embeddings'
    self.index = an.AnnoyIndex(self.embedding_size, metric='angular')
    self.index.load(embeddings_file)
    self.inverse_id_map = dict([(v,k) for k,v in self.id_map.items()]) 
Example 14
def __init__(self, vecs):
        assert len(vecs)>0, 'no vecs available to init AnnoyIndex'
        size = len(vecs[0])
        self.annoy_model = AnnoyIndex(size)
        for idx,vec in enumerate(vecs):
            self.annoy_model.add_item(idx, vec)
        self.annoy_model.build(50) 
Example 15
def __init__(self, file_name, dim_vector=500):
        self.u = AnnoyIndex(dim_vector)
        self.u.load(file_name) 
Example 16
def build_annoy_index(encoded, outfile):
    input_shape = encoded.shape
    f = input_shape[1]
    t = AnnoyIndex(f, metric='angular')  # Length of item vector that will be indexed
    for i,v in enumerate(encoded):
        t.add_item(i, v)

    t.build(100)  # 100 trees
    if outfile is not None:
        t.save(outfile)

    return t 
Example 17
def load_annoy_index(infile, z_dim):
    t = AnnoyIndex(z_dim)
    t.load(infile) # super fast, will just mmap the file
    return t 
Example 18
def test():
    from annoy import AnnoyIndex
    import random

    f = 40
    t = AnnoyIndex(f)  # Length of item vector that will be indexed
    for i in range(1000):
        v = [random.gauss(0, 1) for z in range(f)]
        t.add_item(i, v)

    t.build(10) # 10 trees
    print("test-py-module passed...") 
Example 19
def build_annoy_tree(facial_embeddings, tree_path,
                    annoy_metric='euclidean', annoy_trees_no=256):
    """
    Builds an annoy tree

    Args:
        facial_embeddings: List of facial embeddings to be indexed in tree
        tree_path: where the annoy tree will be saved
        annoy_metric: euclidean / angular
        annoy_trees_no: how many trees in the annoy forest? Larger = more accurate
    """

    # Annoy tree
    tree = AnnoyIndex(128, metric=annoy_metric)

    # Don't want to hold the entire DB in memory
    for idx, f in enumerate(tqdm(facial_embeddings)):
        # SQLite errors sometimes?
        try:
            cur_np = string_to_np(f.latent_space)

            tree.add_item(idx, cur_np)

        except Exception as e:
            tqdm.write(str(e))

    tree.build(annoy_trees_no)
    tree.save(tree_path) 
Example 20
def get_approx_index_chunks(self):
        """Gets decompressed chunks of the AnnoyIndex of the vectors from
        the database."""
        try:
            db = self._db(force_new=True, downloader=True)
            num_chunks = db.execute(
                """
                    SELECT COUNT(rowid)
                    FROM `magnitude_approx`
                    WHERE trees = ?
                """, (self.approx_trees,)).fetchall()[0][0]
            with lz4.frame.LZ4FrameDecompressor() as decompressor:
                chunks = db.execute(
                    """
                        SELECT rowid,index_file
                        FROM `magnitude_approx`
                        WHERE trees = ?
                    """, (self.approx_trees,))
                for chunk in chunks:
                    yield num_chunks, decompressor.decompress(chunk[1])
                    if self.closed:
                        return
        except Exception as e:
            if self.closed:
                pass
            else:
                raise e 
Example 21
def get_approx_index(self):
        """Gets an AnnoyIndex of the vectors from the database."""
        chunks = self.get_approx_index_chunks()
        if self._approx_index is None:
            while True:
                if not self.setup_for_mmap:
                    self._setup_for_mmap()
                try:
                    approx_index = AnnoyIndex(self.emb_dim, metric='angular')
                    approx_index.load(self.path_to_approx_mmap)
                    self._approx_index = approx_index
                    break
                except BaseException:
                    path_to_approx_mmap_temp = self.path_to_approx_mmap \
                        + '.tmp'
                    tlock = self.APPROX_MMAP_THREAD_LOCK.acquire(False)
                    plock = self.APPROX_MMAP_PROCESS_LOCK.acquire(0)
                    if tlock and plock:
                        try:
                            with open(path_to_approx_mmap_temp, "w+b") \
                                    as mmap_file:
                                for chunk in chunks:
                                    mmap_file.write(chunk)
                            if not self.closed:
                                os.rename(path_to_approx_mmap_temp,
                                          self.path_to_approx_mmap)
                            else:
                                return
                        finally:
                            self.APPROX_MMAP_THREAD_LOCK.release()
                            try:
                                self.APPROX_MMAP_PROCESS_LOCK.release()
                            except BaseException:
                                pass
                sleep(1)  # Block before trying again
        return self._approx_index 
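The retry loop is the interesting part here: if AnnoyIndex.load() fails because the mmap file does not exist yet, the first caller to win both the thread lock and the process lock streams the decompressed chunks into a .tmp file and renames it into place (atomic on POSIX), while other callers sleep and retry until the finished file loads.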
Example 22
def generator_from_index(X, Y, index_path, k, batch_size, search_k=-1,
                         precompute=True, verbose=1):
    if k >= X.shape[0] - 1:
        raise Exception('''k value greater than or equal to (num_rows - 1)
                        (k={}, rows={}). Lower k to a smaller
                        value.'''.format(k, X.shape[0]))
    if batch_size > X.shape[0]:
        raise Exception('''batch_size value larger than num_rows in dataset
                        (batch_size={}, rows={}). Lower batch_size to a
                        smaller value.'''.format(batch_size, X.shape[0]))

    if Y is None:
        if precompute:
            if verbose > 0:
                print('Extracting KNN from index')

            neighbour_matrix = extract_knn(X, index_path, k=k,
                                           search_k=search_k, verbose=verbose)
            return create_knn_triplet_dataset(X, neighbour_matrix,
                                              batch_size=batch_size)
        else:
            index = AnnoyIndex(X.shape[1], metric='angular')
            index.load(index_path)
            return create_annoy_triplet_dataset(X, index, k=k,
                                                batch_size=batch_size,
                                                search_k=search_k)
    else:
        if precompute:
            if verbose > 0:
                print('Extracting KNN from index')

            neighbour_matrix = extract_knn(X, index_path, k=k,
                                           search_k=search_k, verbose=verbose)
            return create_labeled_knn_triplet_dataset(X, Y, neighbour_matrix,
                                                    batch_size=batch_size)
        else:
            index = AnnoyIndex(X.shape[1], metric='angular')
            index.load(index_path)
            return create_labeled_annoy_triplet_dataset(X, Y, index,
                                                        k=k, batch_size=batch_size,
                                                        search_k=search_k) 
Example 23
def test_dense_annoy_index(annoy_index_file):
    data = np.random.choice([0, 1], size=(10, 5))
    index = build_annoy_index(data, annoy_index_file)
    assert os.path.exists(annoy_index_file)

    loaded_index = AnnoyIndex(5, metric='angular')
    loaded_index.load(annoy_index_file)

    assert index.f == loaded_index.f == 5
    assert index.get_n_items() == loaded_index.get_n_items() == 10
    assert index.get_nns_by_item(0, 5) == loaded_index.get_nns_by_item(0, 5)

    index.unload()
    loaded_index.unload() 
Example 24
def __init__(self, index_file):
    logging.info('Initialising matching utility...')
    self.index = AnnoyIndex(VECTOR_LENGTH)
    self.index.load(index_file, prefault=True)
    logging.info('Annoy index {} is loaded'.format(index_file))
    with open(index_file + '.mapping', 'rb') as handle:
      self.mapping = pickle.load(handle)
    logging.info('Mapping file {} is loaded'.format(index_file + '.mapping'))
    logging.info('Matching utility initialised.') 
Example 25
def make_text_graph(user_lemma_matrix, dimensionality, metric, number_of_estimators, number_of_neighbors):
    user_lemma_matrix_tfidf = augmented_tf_idf(user_lemma_matrix)
    # print(user_lemma_matrix_tfidf.shape)
    if (user_lemma_matrix_tfidf.shape[0] <= dimensionality) or (user_lemma_matrix_tfidf.shape[1] <= dimensionality):
        X_svd = user_lemma_matrix_tfidf.toarray()
    else:
        X_svd = TruncatedSVD(n_components=dimensionality).fit_transform(user_lemma_matrix_tfidf)

    annoy_index = AnnoyIndex(X_svd.shape[1], metric=metric)

    for q in range(X_svd.shape[0]):
        annoy_index.add_item(q, X_svd[q, :])

    annoy_index.build(number_of_estimators)

    row = list()
    col = list()
    data = list()
    for q in range(X_svd.shape[0]):
        neighbors, distances = annoy_index.get_nns_by_item(q, number_of_neighbors, include_distances=True)

        row.extend([q] * number_of_neighbors)
        col.extend(neighbors)
        data.extend(distances)

    row = np.array(row, dtype=np.int64)
    col = np.array(col, dtype=np.int64)
    data = np.array(data, dtype=np.float64)

    text_graph = spsp.coo_matrix((data,
                                  (row,
                                   col)),
                                 shape=(X_svd.shape[0],
                                        X_svd.shape[0]))
    text_graph = spsp.csr_matrix(text_graph)

    return text_graph 
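The result is a directed k-nearest-neighbour graph over the SVD-reduced rows, stored as a CSR matrix whose edge weights are the Annoy distances.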
Example 26
def index_embeddings(args):
    """Main run function for indexing the embeddings."""
    unique_strings_path = args.infile + '.embedded.pkl_unique_strings.csv'

    # Load the unique lines
    with open(unique_strings_path) as f:
        unique_strings = [line.rstrip() for line in f]

    unique_embeddings_path = (args.infile +
                              '.embedded.pkl_unique_strings_embeddings.txt')
    # Load the unique embeddings
    with open(unique_embeddings_path) as f:
        unique_embeddings = [[float(x) for x in
                              line.strip().split()] for line in f]

    tf.logging.info('Loaded {} unique strings, {} embeddings of dimension {}'.
                    format(len(unique_strings),
                           len(unique_embeddings),
                           len(unique_embeddings[0])))

    # Length of item vector that will be indexed
    nn_forest = AnnoyIndex(512, metric='angular')

    for i in range(len(unique_strings)):
        v = unique_embeddings[i]
        nn_forest.add_item(i, v)

    # Build an approximate nearest neighbor forest with num_trees
    nn_forest.build(int(args.num_trees))
    output_path = args.infile + '.ann'
    nn_forest.save(output_path)

    tf.logging.info('Index forest built {}'.format(output_path))

    return True