Lantern Vector Store
In this notebook we are going to show how to use Postgresql and Lantern to perform vector searches in LlamaIndex.
If you're opening this Notebook on colab, you will probably need to install LlamaIndex 🦙.
In [ ]
%pip install llama-index-vector-stores-lantern
%pip install llama-index-embeddings-openai
In [ ]
!pip install psycopg2-binary llama-index asyncpg
In [ ]
from llama_index.core import SimpleDirectoryReader, StorageContext
from llama_index.core import VectorStoreIndex
from llama_index.vector_stores.lantern import LanternVectorStore
import textwrap
import openai
Setup OpenAI
The first step is to configure the OpenAI key. It will be used to create embeddings for the documents loaded into the index.
In [ ]
import os
os.environ["OPENAI_API_KEY"] = "<your_key>"
openai.api_key = "<your_key>"
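If you'd rather not hardcode the key in the notebook, a small alternative sketch is to prompt for it at runtime (getpass is from the Python standard library):

import getpass
import os

import openai

# Prompt for the key interactively instead of committing it to the notebook.
os.environ["OPENAI_API_KEY"] = getpass.getpass("OpenAI API key: ")
openai.api_key = os.environ["OPENAI_API_KEY"]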
Download Data
In [ ]
!mkdir -p 'data/paul_graham/'
!wget 'https://raw.githubusercontent.com/run-llama/llama_index/main/docs/docs/examples/data/paul_graham/paul_graham_essay.txt' -O 'data/paul_graham/paul_graham_essay.txt'
Loading documents
Load the documents stored in data/paul_graham/ using the SimpleDirectoryReader.
In [ ]
documents = SimpleDirectoryReader("./data/paul_graham").load_data()
print("Document ID:", documents[0].doc_id)
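As an optional sanity check (not part of the original walkthrough), you can confirm what was loaded before building the index:

# One input file should yield one Document; peek at the start of the essay.
print(f"Loaded {len(documents)} document(s)")
print(documents[0].text[:200])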
Create the Database
Using an existing postgres instance running at localhost, create the database we'll be using.
In [ ]
import psycopg2
connection_string = "postgresql://postgres:postgres@localhost:5432"
db_name = "vector_db"  # use a dedicated database; the default "postgres" database cannot be dropped while connected to it
conn = psycopg2.connect(connection_string)
conn.autocommit = True

with conn.cursor() as c:
    c.execute(f"DROP DATABASE IF EXISTS {db_name}")
    c.execute(f"CREATE DATABASE {db_name}")
In [ ]
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core import Settings
# Set up global settings with the embedding model,
# so query strings are embedded and the HNSW index is used at query time.
Settings.embed_model = OpenAIEmbedding(model="text-embedding-3-small")
Create the index
Here we create an index backed by Postgres using the documents loaded previously. LanternVectorStore takes a few arguments.
In [ ]
from sqlalchemy import make_url
url = make_url(connection_string)
vector_store = LanternVectorStore.from_params(
    database=db_name,
    host=url.host,
    password=url.password,
    port=url.port,
    user=url.username,
    table_name="paul_graham_essay",
    embed_dim=1536,  # openai embedding dimension
)

storage_context = StorageContext.from_defaults(vector_store=vector_store)
index = VectorStoreIndex.from_documents(
    documents, storage_context=storage_context, show_progress=True
)
query_engine = index.as_query_engine()
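If you prefer not to mutate the global Settings object, recent LlamaIndex releases also accept the embedding model directly on the index; a sketch under that assumption:

from llama_index.embeddings.openai import OpenAIEmbedding

# Pass embed_model per index instead of via the global Settings object.
index = VectorStoreIndex.from_documents(
    documents,
    storage_context=storage_context,
    embed_model=OpenAIEmbedding(model="text-embedding-3-small"),
    show_progress=True,
)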
Query the index
We can now ask questions using our index.
In [ ]
response = query_engine.query("What did the author do?")
In [ ]
print(textwrap.fill(str(response), 100))
In [ ]
response = query_engine.query("What happened in the mid 1980s?")
In [ ]
print(textwrap.fill(str(response), 100))
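To see which chunks back an answer, you can also call the retriever directly instead of the query engine; a minimal sketch using the standard retriever API:

retriever = index.as_retriever(similarity_top_k=3)
nodes = retriever.retrieve("What did the author do?")
for node_with_score in nodes:
    # Each result carries the matched chunk and its similarity score.
    print(node_with_score.score, node_with_score.node.get_content()[:100])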
Querying existing index
In [ ]
vector_store = LanternVectorStore.from_params(
    database=db_name,
    host=url.host,
    password=url.password,
    port=url.port,
    user=url.username,
    table_name="paul_graham_essay",
    embed_dim=1536,  # openai embedding dimension
    m=16,  # HNSW M parameter
    ef_construction=128,  # HNSW ef construction parameter
    ef=64,  # HNSW ef search parameter
)
# Read more about HNSW parameters here: https://github.com/nmslib/hnswlib/blob/master/ALGO_PARAMS.md

index = VectorStoreIndex.from_vector_store(vector_store=vector_store)
query_engine = index.as_query_engine()
In [ ]
response = query_engine.query("What did the author do?")
In [ ]
print(textwrap.fill(str(response), 100))
Hybrid Search
To enable hybrid search, you need to:

- pass in hybrid_search=True when constructing the LanternVectorStore (and optionally configure text_search_config with the desired language)
- pass in vector_store_query_mode="hybrid" when constructing the query engine (this config is passed to the retriever under the hood). You can also optionally set sparse_top_k to configure how many results we should obtain from sparse text search (defaults to the same value as similarity_top_k).
In [ ]
from sqlalchemy import make_url
url = make_url(connection_string)
hybrid_vector_store = LanternVectorStore.from_params(
    database=db_name,
    host=url.host,
    password=url.password,
    port=url.port,
    user=url.username,
    table_name="paul_graham_essay_hybrid_search",
    embed_dim=1536,  # openai embedding dimension
    hybrid_search=True,
    text_search_config="english",
)

storage_context = StorageContext.from_defaults(
    vector_store=hybrid_vector_store
)
hybrid_index = VectorStoreIndex.from_documents(
    documents, storage_context=storage_context
)
In [ ]
hybrid_query_engine = hybrid_index.as_query_engine(
    vector_store_query_mode="hybrid", sparse_top_k=2
)
hybrid_response = hybrid_query_engine.query(
    "Who does Paul Graham think of with the word schtick"
)
In [ ]
print(hybrid_response)
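To check that both the dense and sparse sides contributed results, you can inspect the response's source nodes (a small sketch using the standard Response API):

for node_with_score in hybrid_response.source_nodes:
    # Each source node records the retrieved chunk and its score.
    print(node_with_score.score, node_with_score.node.get_content()[:80])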