Lantern 向量数据库 (自动检索器) ¶
本指南展示如何在 LlamaIndex 中执行 **自动检索**。
许多流行的向量数据库除了支持用于语义搜索的查询字符串外,还支持一组元数据过滤器。给定一个自然语言查询,我们首先使用 LLM 推断出一组元数据过滤器以及要传递给向量数据库的正确查询字符串(两者都可以为空)。然后针对向量数据库执行这个整体查询捆绑包。
这使得检索形式更加动态和富有表现力,超越了 top-k 语义搜索。给定查询的相关上下文可能只需要按元数据标签进行过滤,或者需要在过滤后的集合中结合过滤和语义搜索,或者仅仅是原始的语义搜索。
我们将用 Lantern 演示一个示例,但自动检索也已在许多其他向量数据库中实现(例如 Pinecone、Chroma、Weaviate 等)。
如果您在 Colab 上打开此 Notebook,您可能需要安装 LlamaIndex 🦙。
输入 []
已复制!
%pip install llama-index-vector-stores-lantern
%pip install llama-index-vector-stores-lantern
输入 []
已复制!
!pip install llama-index psycopg2-binary asyncpg
!pip install llama-index psycopg2-binary asyncpg
输入 []
已复制!
import logging
import sys

# Send all INFO-and-above log records to stdout.
# force=True first removes any handlers already attached to the root logger,
# which keeps this configuration idempotent when the notebook cell is re-run.
# The original code additionally called root.addHandler() with a second stdout
# StreamHandler on top of the one basicConfig installs, so every log line was
# emitted twice; a single basicConfig call is sufficient.
logging.basicConfig(stream=sys.stdout, level=logging.INFO, force=True)
import logging import sys logging.basicConfig(stream=sys.stdout, level=logging.INFO) logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))
输入 []
已复制!
# Configure OpenAI credentials: export the key into the environment and hand
# the same value to the openai client. Replace the placeholder with a real
# API key before running this notebook.
import os
import openai

_api_key = "<your-api-key>"
os.environ["OPENAI_API_KEY"] = _api_key
openai.api_key = os.environ["OPENAI_API_KEY"]
# 设置 OpenAI import os os.environ["OPENAI_API_KEY"] = "" import openai openai.api_key = os.environ["OPENAI_API_KEY"]
输入 []
已复制!
import psycopg2
from sqlalchemy import make_url
# DSN for a local Postgres instance (Lantern runs as a Postgres extension).
connection_string = "postgresql://postgres:postgres@localhost:5432"
# Parse the DSN so host/port/user/password can be passed individually to
# LanternVectorStore.from_params later in the notebook.
url = make_url(connection_string)
db_name = "postgres"
conn = psycopg2.connect(connection_string)
# Autocommit so each statement takes effect immediately instead of being
# wrapped in an explicit transaction (needed for setup DDL).
conn.autocommit = True
import psycopg2 from sqlalchemy import make_url connection_string = "postgresql://postgres:postgres@localhost:5432" url = make_url(connection_string) db_name = "postgres" conn = psycopg2.connect(connection_string) conn.autocommit = True
输入 []
已复制!
from llama_index.core import VectorStoreIndex, StorageContext
from llama_index.vector_stores.lantern import LanternVectorStore
from llama_index.core import VectorStoreIndex, StorageContext from llama_index.vector_stores.lantern import LanternVectorStore
输入 []
已复制!
from llama_index.core.schema import TextNode

# Small demo corpus: (biography text, category, country) triples.
# Category/country become metadata so the auto-retriever can filter on them.
_people = [
    (
        "Michael Jordan is a retired professional basketball player,"
        " widely regarded as one of the greatest basketball players of all"
        " time.",
        "Sports",
        "United States",
    ),
    (
        "Angelina Jolie is an American actress, filmmaker, and"
        " humanitarian. She has received numerous awards for her acting"
        " and is known for her philanthropic work.",
        "Entertainment",
        "United States",
    ),
    (
        "Elon Musk is a business magnate, industrial designer, and"
        " engineer. He is the founder, CEO, and lead designer of SpaceX,"
        " Tesla, Inc., Neuralink, and The Boring Company.",
        "Business",
        "United States",
    ),
    (
        "Rihanna is a Barbadian singer, actress, and businesswoman. She"
        " has achieved significant success in the music industry and is"
        " known for her versatile musical style.",
        "Music",
        "Barbados",
    ),
    (
        "Cristiano Ronaldo is a Portuguese professional footballer who is"
        " considered one of the greatest football players of all time. He"
        " has won numerous awards and set multiple records during his"
        " career.",
        "Sports",
        "Portugal",
    ),
]

nodes = [
    TextNode(text=text, metadata={"category": category, "country": country})
    for text, category, country in _people
]
from llama_index.core.schema import TextNode nodes = [ TextNode( text=( "迈克尔·乔丹是一位已退役的职业篮球运动员," "被广泛认为是史上最伟大的篮球运动员之一。" ), metadata={ "category": "Sports", "country": "United States", }, ), TextNode( text=( "安吉丽娜·朱莉是一位美国女演员、电影制片人和" "人道主义者。她凭借其表演获得了无数奖项,并以其慈善工作闻名。" ), metadata={ "category": "Entertainment", "country": "United States", }, ), TextNode( text=( "埃隆·马斯克是一位商业巨头、工业设计师和" "工程师。他是SpaceX、特斯拉公司、Neuralink和The Boring Company的创始人、首席执行官和首席设计师。" ), metadata={ "category": "Business", "country": "United States", }, ), TextNode( text=( "蕾哈娜是一位巴巴多斯歌手、演员和女商人。她" "在音乐界取得了巨大成功,并以其多变的音乐风格闻名。" ), metadata={ "category": "Music", "country": "Barbados", }, ), TextNode( text=( "克里斯蒂亚诺·罗纳尔多是一位葡萄牙职业足球运动员," "被认为是史上最伟大的足球运动员之一。他" "在职业生涯中赢得了无数奖项并创造了多项记录。" ), metadata={ "category": "Sports", "country": "Portugal", }, ), ]
使用 Lantern Vector Store 构建向量索引¶
在这里我们将数据加载到向量存储中。如上所述,每个节点的文本和元数据都将在 Lantern 中转换为相应的表示形式。我们现在可以对来自 Lantern 的这些数据运行语义查询以及元数据过滤。
输入 []
已复制!
# Connection details plus HNSW tuning for the Lantern-backed vector table.
_lantern_params = dict(
    database=db_name,
    host=url.host,
    password=url.password,
    port=url.port,
    user=url.username,
    table_name="famous_people",
    embed_dim=1536,  # dimensionality of the OpenAI embeddings stored here
    m=16,  # HNSW graph connectivity
    ef_construction=128,  # HNSW build-time candidate-list size
    ef=64,  # HNSW query-time candidate-list size
)
vector_store = LanternVectorStore.from_params(**_lantern_params)

# Storage context that routes index writes/reads through the Lantern store.
storage_context = StorageContext.from_defaults(vector_store=vector_store)
vector_store = LanternVectorStore.from_params( database=db_name, host=url.host, password=url.password, port=url.port, user=url.username, table_name="famous_people", embed_dim=1536, # openai embedding dimension m=16, # HNSW M parameter ef_construction=128, # HNSW ef construction parameter ef=64, # HNSW ef search parameter ) storage_context = StorageContext.from_defaults(vector_store=vector_store)
输入 []
已复制!
# Embed the demo nodes and persist them through the Lantern-backed storage
# context, yielding a queryable vector index.
index = VectorStoreIndex(nodes, storage_context=storage_context)
index = VectorStoreIndex(nodes, storage_context=storage_context)
定义 VectorIndexAutoRetriever¶
我们定义了核心的 VectorIndexAutoRetriever 模块。该模块接收 VectorStoreInfo,其中包含向量存储集合的结构化描述以及它支持的元数据过滤器。这些信息随后将在自动检索提示中使用,LLM 会在此处推断出元数据过滤器。
输入 []
已复制!
from llama_index.core.retrievers import VectorIndexAutoRetriever
from llama_index.core.vector_stores import MetadataInfo, VectorStoreInfo

# Structured description of the collection and the metadata filters it
# supports. The auto-retrieval prompt embeds this so the LLM can infer both
# a query string and metadata filters from natural language.
_category_info = MetadataInfo(
    name="category",
    type="str",
    description="Category of the celebrity, one of [Sports, Entertainment, Business, Music]",
)
_country_info = MetadataInfo(
    name="country",
    type="str",
    description="Country of the celebrity, one of [United States, Barbados, Portugal]",
)
vector_store_info = VectorStoreInfo(
    content_info="brief biography of celebrities",
    metadata_info=[_category_info, _country_info],
)

retriever = VectorIndexAutoRetriever(index, vector_store_info=vector_store_info)
from llama_index.core.retrievers import VectorIndexAutoRetriever from llama_index.core.vector_stores import MetadataInfo, VectorStoreInfo vector_store_info = VectorStoreInfo( content_info="名人简要传记", metadata_info=[ MetadataInfo( name="category", type="str", description=( "名人的类别,可以是 [体育, 娱乐," " 商业, 音乐] 之一" ), ), MetadataInfo( name="country", type="str", description=( "名人的国家,可以是 [美国, 巴巴多斯," " 葡萄牙] 之一" ), ), ], ) retriever = VectorIndexAutoRetriever( index, vector_store_info=vector_store_info )
运行一些示例查询¶
我们尝试运行一些示例查询。请注意元数据过滤器是如何被推断出来的——这有助于实现更精确的检索!
输入 []
已复制!
# The auto-retriever should infer a `country == "United States"` metadata
# filter from this natural-language query before running semantic search.
retriever.retrieve("Tell me about two celebrities from United States")
retriever.retrieve("告诉我两个来自美国的明星")