Defining a Custom Property Graph Retriever¶
This guide shows you how to define a custom retriever against a property graph.
It is more involved than using our out-of-the-box graph retrievers, but it gives you granular control over the retrieval process so that it's better tailored to your application.
We show you how to define an advanced retrieval flow by directly leveraging the property graph store. We'll execute both a vector search and a text-to-Cypher retrieval, and then combine the results through a reranking module.
%pip install llama-index
%pip install llama-index-graph-stores-neo4j
%pip install llama-index-postprocessor-cohere-rerank
Setup and Build the Property Graph¶
import nest_asyncio
nest_asyncio.apply()
import os
os.environ["OPENAI_API_KEY"] = "sk-..."
Load Paul Graham Essay¶
!mkdir -p 'data/paul_graham/'
!wget 'https://raw.githubusercontent.com/run-llama/llama_index/main/docs/docs/examples/data/paul_graham/paul_graham_essay.txt' -O 'data/paul_graham/paul_graham_essay.txt'
from llama_index.core import SimpleDirectoryReader
documents = SimpleDirectoryReader("./data/paul_graham/").load_data()
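If you want to sanity-check the load before building the graph, a quick inspection like the following works (a minimal sketch; the print statements are illustrative only):

# Optional sanity check on the loaded documents
print(len(documents))  # expect one document for the single essay file
print(documents[0].text[:200])  # preview the first 200 characters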
Define Default LLMs¶
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.llms.openai import OpenAI
llm = OpenAI(model="gpt-3.5-turbo", temperature=0.3)
embed_model = OpenAIEmbedding(model_name="text-embedding-3-small")
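If you'd rather not pass llm and embed_model explicitly to every component, you can optionally register them as global defaults via the Settings object (a small sketch; the rest of this guide keeps passing them explicitly):

from llama_index.core import Settings

# Optional: register global defaults so components pick these up automatically
Settings.llm = llm
Settings.embed_model = embed_model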
Setup Neo4j¶
To launch Neo4j locally, first ensure you have docker installed. Then, you can launch the database with the following docker command:
docker run \
-p 7474:7474 -p 7687:7687 \
-v $PWD/data:/data -v $PWD/plugins:/plugins \
--name neo4j-apoc \
-e NEO4J_apoc_export_file_enabled=true \
-e NEO4J_apoc_import_file_enabled=true \
-e NEO4J_apoc_import_file_use__neo4j__config=true \
-e NEO4JLABS_PLUGINS=\[\"apoc\"\] \
neo4j:latest
From here, you can open the database at http://localhost:7474/. On this page, you will be asked to sign in. Use the default username/password of neo4j and neo4j. Once you log in for the first time, you will be asked to change the password (the code below assumes it was changed to llamaindex).
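Before creating the graph store, you can optionally confirm the database is reachable using the official neo4j Python driver (a sketch; it assumes the neo4j package is installed and uses the same credentials as the code below):

from neo4j import GraphDatabase

# Optional: confirm the server is reachable and the credentials work
driver = GraphDatabase.driver("bolt://localhost:7687", auth=("neo4j", "llamaindex"))
driver.verify_connectivity()  # raises if the server is unreachable or auth fails
driver.close()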
from llama_index.graph_stores.neo4j import Neo4jPropertyGraphStore
graph_store = Neo4jPropertyGraphStore(
username="neo4j",
password="llamaindex",
url="bolt://localhost:7687",
)
Build the Property Graph¶
from llama_index.core import PropertyGraphIndex
index = PropertyGraphIndex.from_documents(
documents,
llm=llm,
embed_model=embed_model,
property_graph_store=graph_store,
show_progress=True,
)
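Building the index runs LLM-powered extraction over every chunk, which can take a while. If Neo4j was already populated by a previous run, you can reconnect without re-extracting; a sketch using PropertyGraphIndex.from_existing:

# Reconnect to an already-populated graph instead of re-running extraction
index = PropertyGraphIndex.from_existing(
    property_graph_store=graph_store,
    llm=llm,
    embed_model=embed_model,
)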
from llama_index.core.retrievers import (
CustomPGRetriever,
VectorContextRetriever,
TextToCypherRetriever,
)
from llama_index.core.graph_stores import PropertyGraphStore
from llama_index.core.vector_stores.types import VectorStore
from llama_index.core.embeddings import BaseEmbedding
from llama_index.core.prompts import PromptTemplate
from llama_index.core.llms import LLM
from llama_index.postprocessor.cohere_rerank import CohereRerank
from typing import Optional, Any, Union
class MyCustomRetriever(CustomPGRetriever):
"""Custom retriever with cohere reranking."""
def init(
self,
## vector context retriever params
embed_model: Optional[BaseEmbedding] = None,
vector_store: Optional[VectorStore] = None,
similarity_top_k: int = 4,
path_depth: int = 1,
## text-to-cypher params
llm: Optional[LLM] = None,
text_to_cypher_template: Optional[Union[PromptTemplate, str]] = None,
## cohere reranker params
cohere_api_key: Optional[str] = None,
cohere_top_n: int = 2,
**kwargs: Any,
) -> None:
"""Uses any kwargs passed in from class constructor."""
self.vector_retriever = VectorContextRetriever(
self.graph_store,
include_text=self.include_text,
embed_model=embed_model,
vector_store=vector_store,
similarity_top_k=similarity_top_k,
path_depth=path_depth,
)
self.cypher_retriever = TextToCypherRetriever(
self.graph_store,
llm=llm,
text_to_cypher_template=text_to_cypher_template
## NOTE: you can attach other parameters here if you'd like
)
self.reranker = CohereRerank(
api_key=cohere_api_key, top_n=cohere_top_n
)
def custom_retrieve(self, query_str: str) -> str:
"""Define custom retriever with reranking.
Could return `str`, `TextNode`, `NodeWithScore`, or a list of those.
"""
nodes_1 = self.vector_retriever.retrieve(query_str)
nodes_2 = self.cypher_retriever.retrieve(query_str)
reranked_nodes = self.reranker.postprocess_nodes(
nodes_1 + nodes_2, query_str=query_str
)
## TMP: please change
final_text = "\n\n".join(
[n.get_content(metadata_mode="llm") for n in reranked_nodes]
)
return final_text
# optional async method
# async def acustom_retrieve(self, query_str: str) -> str:
# ...
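As the comment above notes, you can also supply an async version of the retrieval method. A minimal sketch (this method would live inside MyCustomRetriever; it awaits the sub-retrievers' built-in async retrieve methods and keeps the reranking call synchronous):

# Sketch of the optional async variant, defined inside MyCustomRetriever
async def acustom_retrieve(self, query_str: str) -> str:
    """Async version of custom_retrieve."""
    nodes_1 = await self.vector_retriever.aretrieve(query_str)
    nodes_2 = await self.cypher_retriever.aretrieve(query_str)
    reranked_nodes = self.reranker.postprocess_nodes(
        nodes_1 + nodes_2, query_str=query_str
    )
    return "\n\n".join(
        n.get_content(metadata_mode="llm") for n in reranked_nodes
    )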
Test out the Custom Retriever¶
Now let's initialize and test out the custom retriever against our data!
To build a full RAG pipeline, we use the RetrieverQueryEngine to combine our retriever with the LLM synthesis module, which is also what the property graph index uses under the hood.
custom_sub_retriever = MyCustomRetriever(
index.property_graph_store,
include_text=True,
vector_store=index.vector_store,
cohere_api_key="...",
)
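Before wiring the retriever into a query engine, you can call it directly to inspect what it returns (a quick sketch; the query string is just an example):

# Inspect the raw retrieval output; the string returned by custom_retrieve
# gets wrapped into NodeWithScore objects by the CustomPGRetriever base class
nodes = custom_sub_retriever.retrieve("Did the author like programming?")
print(len(nodes))
print(nodes[0].get_content()[:200])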
from llama_index.core.query_engine import RetrieverQueryEngine
query_engine = RetrieverQueryEngine.from_args(
index.as_retriever(sub_retrievers=[custom_sub_retriever]), llm=llm
)
Try a 'Baseline'¶
We compare against a baseline retriever that only uses vector context.
base_retriever = VectorContextRetriever(
index.property_graph_store, include_text=True
)
base_query_engine = index.as_query_engine(sub_retrievers=[base_retriever])
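As with the custom retriever, you can also call the baseline directly to see what it surfaces for a given query (a small sketch for side-by-side comparison):

# Compare what the baseline retriever surfaces for the same query
base_nodes = base_retriever.retrieve("Did the author like programming?")
print(f"baseline retrieved {len(base_nodes)} nodes")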
Try Some Queries¶
response = query_engine.query("Did the author like programming?")
print(str(response))
response = query_engine.query("Did the author like programming?") print(str(response))
The author found working on programming challenging but satisfying, as indicated by the intense effort put into the project and the sense of accomplishment derived from solving complex problems while working on the code.
response = base_query_engine.query("Did the author like programming?")
print(str(response))
The author enjoyed programming, as evidenced by their early experiences with computers, such as writing simple games, creating programs for predicting rocket flights, and developing a word processor. These experiences indicate a genuine interest and enjoyment in programming activities.