Hnswlib¶
Hnswlib is a fast approximate nearest-neighbor search index. It is a lightweight, header-only C++ implementation of HNSW with no dependencies other than C++11. Hnswlib provides Python bindings.
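As a point of reference, here is a minimal, self-contained sketch of using hnswlib directly, without LlamaIndex. All values (dimension, dataset size, parameters) are illustrative only:

import hnswlib
import numpy as np

# Build an inner-product HNSW index over 100 random 384-dimensional vectors.
index = hnswlib.Index(space="ip", dim=384)
index.init_index(max_elements=100, M=16, ef_construction=200)
data = np.random.rand(100, 384).astype(np.float32)
index.add_items(data, ids=np.arange(100))

# Query for the 5 approximate nearest neighbors of the first vector.
index.set_ef(50)  # query-time search width; must be >= k
labels, distances = index.knn_query(data[:1], k=5)
print(labels, distances)

The rest of this notebook wraps the same kind of index behind LlamaIndex's HnswlibVectorStore.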
In [ ]
%pip install llama-index
%pip install llama-index-vector-stores-hnswlib
%pip install llama-index-embeddings-huggingface
%pip install hnswlib
Import package dependencies¶
In [ ]
from llama_index.vector_stores.hnswlib import HnswlibVectorStore
from llama_index.core import (
VectorStoreIndex,
StorageContext,
SimpleDirectoryReader,
)
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
Load example data¶
In [ ]
!mkdir -p 'data/paul_graham/'
!wget 'https://raw.githubusercontent.com/run-llama/llama_index/main/docs/docs/examples/data/paul_graham/paul_graham_essay.txt' -O 'data/paul_graham/paul_graham_essay.txt'
Read the data¶
In [ ]
documents = SimpleDirectoryReader("./data/paul_graham/").load_data()
print(f"Total documents: {len(documents)}")
print(f"First document, id: {documents[0].doc_id}")
print(f"First document, hash: {documents[0].hash}")
print(
"First document, text"
f" ({len(documents[0].text)} characters):\n{'='*20}\n{documents[0].text[:360]} ..."
)
Load the embedding model¶
In [ ]
embed_model = HuggingFaceEmbedding(
model_name="sentence-transformers/all-MiniLM-L6-v2",
normalize=True,
)
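A quick, optional sanity check: sentence-transformers/all-MiniLM-L6-v2 produces 384-dimensional embeddings, and with normalize=True they have unit length, so the inner-product ("ip") space used below behaves like cosine similarity.

sample_vector = embed_model.get_text_embedding("hello world")
print(len(sample_vector))  # expected: 384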
Create a Hnswlib Vector Store object from Hnswlib.Index parameters¶
In [ ]
hnswlib_vector_store = HnswlibVectorStore.from_params(
space="ip",
dimension=embed_model._model.get_sentence_embedding_dimension(),
max_elements=1000,
)
Alternatively, you can create a Hnswlib.Index object yourself.
In [ ]
import hnswlib
index = hnswlib.Index(
"ip", embed_model._model.get_sentence_embedding_dimension()
)
index.init_index(max_elements=1000)
hnswlib_vector_store = HnswlibVectorStore(index)
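If you construct the hnswlib.Index yourself, you can also tune the HNSW build parameters. Note that max_elements is a hard capacity, so it must be at least the number of nodes you plan to insert. A sketch with illustrative values (not recommendations):

index = hnswlib.Index(
    "ip", embed_model._model.get_sentence_embedding_dimension()
)
index.init_index(
    max_elements=1000,  # hard capacity of the index
    M=16,  # graph connectivity: higher improves recall, costs more memory
    ef_construction=200,  # build-time search width: higher quality, slower build
)
index.set_ef(50)  # query-time search width; must be >= similarity_top_k
hnswlib_vector_store = HnswlibVectorStore(index)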
Build the index from documents¶
In [ ]
hnswlib_storage_context = StorageContext.from_defaults(
vector_store=hnswlib_vector_store
)
hnswlib_index = VectorStoreIndex.from_documents(
documents,
storage_context=hnswlib_storage_context,
embed_model=embed_model,
show_progress=True,
)
Query the index¶
In [ ]
k = 5
query = "Before college I wrote what begginers should write."
hnswlib_vector_retriever = hnswlib_index.as_retriever(similarity_top_k=k)
nodes_with_scores = hnswlib_vector_retriever.retrieve(query)
for node in nodes_with_scores:
print(f"Node {node.id_} | Score: {node.score:.3f} - {node.text[:120]}...")