Lantern Vector Store
In this notebook we are going to show how to use Postgresql and Lantern to perform vector searches in LlamaIndex.
If you're opening this Notebook on colab, you will probably need to install LlamaIndex 🦙.
In [ ]
%pip install llama-index-vector-stores-lantern
%pip install llama-index-embeddings-openai
In [ ]
!pip install psycopg2-binary llama-index asyncpg
In [ ]
from llama_index.core import SimpleDirectoryReader, StorageContext
from llama_index.core import VectorStoreIndex
from llama_index.vector_stores.lantern import LanternVectorStore
import textwrap
import openai
Setup OpenAI
The first step is to configure the OpenAI key. It will be used to create embeddings for the documents loaded into the index.
In [ ]
import os
os.environ["OPENAI_API_KEY"] = "<your_key>"
openai.api_key = "<your_key>"
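If you'd rather not hardcode the key in the notebook, a small alternative sketch is to prompt for it at runtime (getpass is from the Python standard library):

import getpass
import os

import openai

# Prompt for the key interactively instead of committing it to the notebook.
os.environ["OPENAI_API_KEY"] = getpass.getpass("OpenAI API key: ")
openai.api_key = os.environ["OPENAI_API_KEY"]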
Download Data
In [ ]
!mkdir -p 'data/paul_graham/'
!wget 'https://raw.githubusercontent.com/run-llama/llama_index/main/docs/docs/examples/data/paul_graham/paul_graham_essay.txt' -O 'data/paul_graham/paul_graham_essay.txt'
Loading documents
Load the documents stored in data/paul_graham/ using the SimpleDirectoryReader.
In [ ]
documents = SimpleDirectoryReader("./data/paul_graham").load_data()
print("Document ID:", documents[0].doc_id)
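As an optional sanity check (not part of the original walkthrough), you can confirm what was loaded before building the index:

# One input file should yield one Document; peek at the start of the essay.
print(f"Loaded {len(documents)} document(s)")
print(documents[0].text[:200])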
Create the Database
Using an existing postgres instance running at localhost, create the database we'll be using.
In [ ]
import psycopg2
connection_string = "postgresql://postgres:postgres@localhost:5432"
db_name = "vector_db"  # use a dedicated database; the default "postgres" database cannot be dropped while connected to it
conn = psycopg2.connect(connection_string)
conn.autocommit = True

with conn.cursor() as c:
    c.execute(f"DROP DATABASE IF EXISTS {db_name}")
    c.execute(f"CREATE DATABASE {db_name}")
In [ ]
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core import Settings
# Set up global settings with the embedding model,
# so query strings are embedded and the HNSW index is used at query time.
Settings.embed_model = OpenAIEmbedding(model="text-embedding-3-small")
Create the index
Here we create an index backed by Postgres using the documents loaded previously. LanternVectorStore takes a few arguments.
In [ ]
from sqlalchemy import make_url
url = make_url(connection_string)
vector_store = LanternVectorStore.from_params(
    database=db_name,
    host=url.host,
    password=url.password,
    port=url.port,
    user=url.username,
    table_name="paul_graham_essay",
    embed_dim=1536,  # openai embedding dimension
)

storage_context = StorageContext.from_defaults(vector_store=vector_store)
index = VectorStoreIndex.from_documents(
    documents, storage_context=storage_context, show_progress=True
)
query_engine = index.as_query_engine()
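If you prefer not to mutate the global Settings object, recent LlamaIndex releases also accept the embedding model directly on the index; a sketch under that assumption:

from llama_index.embeddings.openai import OpenAIEmbedding

# Pass embed_model per index instead of via the global Settings object.
index = VectorStoreIndex.from_documents(
    documents,
    storage_context=storage_context,
    embed_model=OpenAIEmbedding(model="text-embedding-3-small"),
    show_progress=True,
)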
Query the index
We can now ask questions using our index.
In [ ]
response = query_engine.query("What did the author do?")
In [ ]
print(textwrap.fill(str(response), 100))
In [ ]
response = query_engine.query("What happened in the mid 1980s?")
In [ ]
print(textwrap.fill(str(response), 100))
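To see which chunks back an answer, you can also call the retriever directly instead of the query engine; a minimal sketch using the standard retriever API:

retriever = index.as_retriever(similarity_top_k=3)
nodes = retriever.retrieve("What did the author do?")
for node_with_score in nodes:
    # Each result carries the matched chunk and its similarity score.
    print(node_with_score.score, node_with_score.node.get_content()[:100])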
Querying existing index
In [ ]
vector_store = LanternVectorStore.from_params(
    database=db_name,
    host=url.host,
    password=url.password,
    port=url.port,
    user=url.username,
    table_name="paul_graham_essay",
    embed_dim=1536,  # openai embedding dimension
    m=16,  # HNSW M parameter
    ef_construction=128,  # HNSW ef construction parameter
    ef=64,  # HNSW ef search parameter
)
# Read more about HNSW parameters here: https://github.com/nmslib/hnswlib/blob/master/ALGO_PARAMS.md

index = VectorStoreIndex.from_vector_store(vector_store=vector_store)
query_engine = index.as_query_engine()
In [ ]
response = query_engine.query("What did the author do?")
In [ ]
print(textwrap.fill(str(response), 100))
Hybrid Search
To enable hybrid search, you need to:

- pass in hybrid_search=True when constructing the LanternVectorStore (and optionally configure text_search_config with the desired language)
- pass in vector_store_query_mode="hybrid" when constructing the query engine (this config is passed to the retriever under the hood). You can also optionally set sparse_top_k to configure how many results we should obtain from sparse text search (defaults to the same value as similarity_top_k).
In [ ]
from sqlalchemy import make_url
url = make_url(connection_string)
hybrid_vector_store = LanternVectorStore.from_params(
    database=db_name,
    host=url.host,
    password=url.password,
    port=url.port,
    user=url.username,
    table_name="paul_graham_essay_hybrid_search",
    embed_dim=1536,  # openai embedding dimension
    hybrid_search=True,
    text_search_config="english",
)

storage_context = StorageContext.from_defaults(
    vector_store=hybrid_vector_store
)
hybrid_index = VectorStoreIndex.from_documents(
    documents, storage_context=storage_context
)
In [ ]
hybrid_query_engine = hybrid_index.as_query_engine(
    vector_store_query_mode="hybrid", sparse_top_k=2
)
hybrid_response = hybrid_query_engine.query(
    "Who does Paul Graham think of with the word schtick"
)
In [ ]
print(hybrid_response)
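To check that both the dense and sparse sides contributed results, you can inspect the response's source nodes (a small sketch using the standard Response API):

for node_with_score in hybrid_response.source_nodes:
    # Each source node records the retrieved chunk and its score.
    print(node_with_score.score, node_with_score.node.get_content()[:80])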