递归检索器 + 节点引用 + Braintrust¶
本指南将展示如何使用递归检索来遍历节点关系,并根据“引用”来获取节点。
节点引用是一个强大的概念。首次执行检索时,您可能希望检索引用而不是原始文本。多个引用可以指向同一个节点。
在本指南中,我们将探讨节点引用的一些不同用法
- 块引用:不同大小的块引用更大的块
- 元数据引用:摘要 + 生成的问题引用更大的块
我们使用 Braintrust 评估了我们的递归检索 + 节点引用方法的有效性。Braintrust 是构建 AI 产品的企业级平台。从评估到提示词 playground 再到数据管理,它能将把 AI 融入业务过程中的不确定性和繁琐工作降至最低。
您可以在此处查看以下示例评估仪表板:
输入 [ ]
已复制!
%pip install llama-index-llms-openai
%pip install llama-index-readers-file
%pip install llama-index-llms-openai %pip install llama-index-readers-file
输入 [ ]
已复制!
%load_ext autoreload
%autoreload 2
# NOTE: Replace YOUR_OPENAI_API_KEY with your OpenAI API Key and YOUR_BRAINTRUST_API_KEY with your BrainTrust API key. Do not put it in quotes.
# Signup for Braintrust at https://braintrustdata.com/ and get your API key at https://www.braintrustdata.com/app/braintrustdata.com/settings/api-keys
# NOTE: Replace YOUR_OPENAI_KEY with your OpenAI API Key and YOUR_BRAINTRUST_API_KEY with your BrainTrust API key. Do not put it in quotes.
%env OPENAI_API_KEY=
%env BRAINTRUST_API_KEY=
%env TOKENIZERS_PARALLELISM=true # This is needed to avoid a warning message from Chroma
%load_ext autoreload %autoreload 2 # NOTE: Replace YOUR_OPENAI_API_KEY with your OpenAI API Key and YOUR_BRAINTRUST_API_KEY with your BrainTrust API key. Do not put it in quotes. # Signup for Braintrust at https://braintrustdata.com/ and get your API key at https://www.braintrustdata.com/app/braintrustdata.com/settings/api-keys # NOTE: Replace YOUR_OPENAI_KEY with your OpenAI API Key and YOUR_BRAINTRUST_API_KEY with your BrainTrust API key. Do not put it in quotes. %env OPENAI_API_KEY= %env BRAINTRUST_API_KEY= %env TOKENIZERS_PARALLELISM=true # This is needed to avoid a warning message from Chroma
输入 [ ]
已复制!
%pip install -U llama_hub llama_index braintrust autoevals pypdf pillow transformers torch torchvision
%pip install -U llama_hub llama_index braintrust autoevals pypdf pillow transformers torch torchvision
加载数据 + 设置¶
在本节中,我们将下载 Llama 2 论文并创建一组初始节点(块大小 1024)。
输入 [ ]
已复制!
!mkdir data
!wget --user-agent "Mozilla" "https://arxiv.org/pdf/2307.09288.pdf" -O "data/llama2.pdf"
!mkdir data !wget --user-agent "Mozilla" "https://arxiv.org/pdf/2307.09288.pdf" -O "data/llama2.pdf"
输入 [ ]
已复制!
from pathlib import Path
from llama_index.readers.file import PDFReader
from llama_index.core.response.notebook_utils import display_source_node
from llama_index.core.retrievers import RecursiveRetriever
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core import VectorStoreIndex
from llama_index.llms.openai import OpenAI
import json
from pathlib import Path from llama_index.readers.file import PDFReader from llama_index.core.response.notebook_utils import display_source_node from llama_index.core.retrievers import RecursiveRetriever from llama_index.core.query_engine import RetrieverQueryEngine from llama_index.core import VectorStoreIndex from llama_index.llms.openai import OpenAI import json
输入 [ ]
已复制!
loader = PDFReader()
docs0 = loader.load_data(file=Path("./data/llama2.pdf"))
loader = PDFReader() docs0 = loader.load_data(file=Path("./data/llama2.pdf"))
输入 [ ]
已复制!
from llama_index.core import Document
doc_text = "\n\n".join([d.get_content() for d in docs0])
docs = [Document(text=doc_text)]
from llama_index.core import Document doc_text = "\n\n".join([d.get_content() for d in docs0]) docs = [Document(text=doc_text)]
输入 [ ]
已复制!
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.schema import IndexNode
from llama_index.core.node_parser import SentenceSplitter from llama_index.core.schema import IndexNode
输入 [ ]
已复制!
node_parser = SentenceSplitter(chunk_size=1024)
node_parser = SentenceSplitter(chunk_size=1024)
输入 [ ]
已复制!
base_nodes = node_parser.get_nodes_from_documents(docs)
# set node ids to be a constant
for idx, node in enumerate(base_nodes):
node.id_ = f"node-{idx}"
base_nodes = node_parser.get_nodes_from_documents(docs) # set node ids to be a constant for idx, node in enumerate(base_nodes): node.id_ = f"node-{idx}"
输入 [ ]
已复制!
from llama_index.core.embeddings import resolve_embed_model
embed_model = resolve_embed_model("local:BAAI/bge-small-en")
llm = OpenAI(model="gpt-3.5-turbo")
from llama_index.core.embeddings import resolve_embed_model embed_model = resolve_embed_model("local:BAAI/bge-small-en") llm = OpenAI(model="gpt-3.5-turbo")
基准检索器¶
定义一个基准检索器,它仅通过嵌入相似度获取 top-k 个原始文本节点。
输入 [ ]
已复制!
base_index = VectorStoreIndex(base_nodes, embed_model=embed_model)
base_retriever = base_index.as_retriever(similarity_top_k=2)
base_index = VectorStoreIndex(base_nodes, embed_model=embed_model) base_retriever = base_index.as_retriever(similarity_top_k=2)
输入 [ ]
已复制!
retrievals = base_retriever.retrieve(
"Can you tell me about the key concepts for safety finetuning"
)
retrievals = base_retriever.retrieve( "Can you tell me about the key concepts for safety finetuning" )
输入 [ ]
已复制!
for n in retrievals:
display_source_node(n, source_length=1500)
for n in retrievals: display_source_node(n, source_length=1500)
输入 [ ]
已复制!
query_engine_base = RetrieverQueryEngine.from_args(base_retriever, llm=llm)
query_engine_base = RetrieverQueryEngine.from_args(base_retriever, llm=llm)
输入 [ ]
已复制!
response = query_engine_base.query(
"Can you tell me about the key concepts for safety finetuning"
)
print(str(response))
response = query_engine_base.query( "Can you tell me about the key concepts for safety finetuning" ) print(str(response))
输入 [ ]
已复制!
sub_chunk_sizes = [128, 256, 512]
sub_node_parsers = [SentenceSplitter(chunk_size=c) for c in sub_chunk_sizes]
all_nodes = []
for base_node in base_nodes:
for n in sub_node_parsers:
sub_nodes = n.get_nodes_from_documents([base_node])
sub_inodes = [
IndexNode.from_text_node(sn, base_node.node_id) for sn in sub_nodes
]
all_nodes.extend(sub_inodes)
# also add original node to node
original_node = IndexNode.from_text_node(base_node, base_node.node_id)
all_nodes.append(original_node)
sub_chunk_sizes = [128, 256, 512] sub_node_parsers = [SentenceSplitter(chunk_size=c) for c in sub_chunk_sizes] all_nodes = [] for base_node in base_nodes: for n in sub_node_parsers: sub_nodes = n.get_nodes_from_documents([base_node]) sub_inodes = [ IndexNode.from_text_node(sn, base_node.node_id) for sn in sub_nodes ] all_nodes.extend(sub_inodes) # also add original node to node original_node = IndexNode.from_text_node(base_node, base_node.node_id) all_nodes.append(original_node)
输入 [ ]
已复制!
all_nodes_dict = {n.node_id: n for n in all_nodes}
all_nodes_dict = {n.node_id: n for n in all_nodes}
输入 [ ]
已复制!
vector_index_chunk = VectorStoreIndex(all_nodes, embed_model=embed_model)
vector_index_chunk = VectorStoreIndex(all_nodes, embed_model=embed_model)
输入 [ ]
已复制!
vector_retriever_chunk = vector_index_chunk.as_retriever(similarity_top_k=2)
vector_retriever_chunk = vector_index_chunk.as_retriever(similarity_top_k=2)
输入 [ ]
已复制!
retriever_chunk = RecursiveRetriever(
"vector",
retriever_dict={"vector": vector_retriever_chunk},
node_dict=all_nodes_dict,
verbose=True,
)
retriever_chunk = RecursiveRetriever( "vector", retriever_dict={"vector": vector_retriever_chunk}, node_dict=all_nodes_dict, verbose=True, )
输入 [ ]
已复制!
nodes = retriever_chunk.retrieve(
"Can you tell me about the key concepts for safety finetuning"
)
for node in nodes:
display_source_node(node, source_length=2000)
nodes = retriever_chunk.retrieve( "Can you tell me about the key concepts for safety finetuning" ) for node in nodes: display_source_node(node, source_length=2000)
输入 [ ]
已复制!
query_engine_chunk = RetrieverQueryEngine.from_args(retriever_chunk, llm=llm)
query_engine_chunk = RetrieverQueryEngine.from_args(retriever_chunk, llm=llm)
输入 [ ]
已复制!
response = query_engine_chunk.query(
"Can you tell me about the key concepts for safety finetuning"
)
print(str(response))
response = query_engine_chunk.query( "Can you tell me about the key concepts for safety finetuning" ) print(str(response))
元数据引用:摘要 + 生成的问题引用较大的块¶
在此用法示例中,我们将展示如何定义引用源节点的附加上下文。
此附加上下文包括摘要以及生成的问题。
在查询时,我们检索较小的块,但会跟随引用到较大的块。这使我们能够获得更多上下文来进行合成。
输入 [ ]
已复制!
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.schema import IndexNode
from llama_index.core.extractors import (
SummaryExtractor,
QuestionsAnsweredExtractor,
)
from llama_index.core.node_parser import SentenceSplitter from llama_index.core.schema import IndexNode from llama_index.core.extractors import ( SummaryExtractor, QuestionsAnsweredExtractor, )
输入 [ ]
已复制!
extractors = [
SummaryExtractor(summaries=["self"], show_progress=True),
QuestionsAnsweredExtractor(questions=5, show_progress=True),
]
extractors = [ SummaryExtractor(summaries=["self"], show_progress=True), QuestionsAnsweredExtractor(questions=5, show_progress=True), ]
输入 [ ]
已复制!
# run metadata extractor across base nodes, get back dictionaries
metadata_dicts = []
for extractor in extractors:
metadata_dicts.extend(extractor.extract(base_nodes))
# run metadata extractor across base nodes, get back dictionaries metadata_dicts = [] for extractor in extractors: metadata_dicts.extend(extractor.extract(base_nodes))
输入 [ ]
已复制!
# cache metadata dicts
def save_metadata_dicts(path, dicts=None):
    """Persist metadata dicts to *path* as JSON Lines (one dict per line).

    Args:
        path: Destination file path.
        dicts: The dicts to write. Defaults to the notebook-global
            ``metadata_dicts`` for backward compatibility with existing
            callers that pass only a path.
    """
    if dicts is None:
        dicts = metadata_dicts  # fall back to the notebook-global list
    with open(path, "w") as fp:
        for m in dicts:
            fp.write(json.dumps(m) + "\n")


def load_metadata_dicts(path):
    """Load metadata dicts from a JSON Lines file written by
    ``save_metadata_dicts``.

    Args:
        path: Source file path.

    Returns:
        list[dict]: One parsed dict per line.
    """
    with open(path, "r") as fp:
        # iterate the file object directly; no need to materialize readlines()
        return [json.loads(line) for line in fp]
# cache metadata dicts def save_metadata_dicts(path): with open(path, "w") as fp: for m in metadata_dicts: fp.write(json.dumps(m) + "\n") def load_metadata_dicts(path): with open(path, "r") as fp: metadata_dicts = [json.loads(l) for l in fp.readlines()] return metadata_dicts
输入 [ ]
已复制!
save_metadata_dicts("data/llama2_metadata_dicts.jsonl")
save_metadata_dicts("data/llama2_metadata_dicts.jsonl")
输入 [ ]
已复制!
metadata_dicts = load_metadata_dicts("data/llama2_metadata_dicts.jsonl")
metadata_dicts = load_metadata_dicts("data/llama2_metadata_dicts.jsonl")
输入 [ ]
已复制!
# all nodes consists of source nodes, along with metadata
import copy
all_nodes = copy.deepcopy(base_nodes)
for idx, d in enumerate(metadata_dicts):
inode_q = IndexNode(
text=d["questions_this_excerpt_can_answer"],
index_id=base_nodes[idx].node_id,
)
inode_s = IndexNode(
text=d["section_summary"], index_id=base_nodes[idx].node_id
)
all_nodes.extend([inode_q, inode_s])
# all nodes consists of source nodes, along with metadata import copy all_nodes = copy.deepcopy(base_nodes) for idx, d in enumerate(metadata_dicts): inode_q = IndexNode( text=d["questions_this_excerpt_can_answer"], index_id=base_nodes[idx].node_id, ) inode_s = IndexNode( text=d["section_summary"], index_id=base_nodes[idx].node_id ) all_nodes.extend([inode_q, inode_s])
输入 [ ]
已复制!
all_nodes_dict = {n.node_id: n for n in all_nodes}
all_nodes_dict = {n.node_id: n for n in all_nodes}
输入 [ ]
已复制!
## Load index into vector index
from llama_index.core import VectorStoreIndex
from llama_index.llms.openai import OpenAI
llm = OpenAI(model="gpt-3.5-turbo")
vector_index_metadata = VectorStoreIndex(all_nodes)
## Load index into vector index from llama_index.core import VectorStoreIndex from llama_index.llms.openai import OpenAI llm = OpenAI(model="gpt-3.5-turbo") vector_index_metadata = VectorStoreIndex(all_nodes)
输入 [ ]
已复制!
vector_retriever_metadata = vector_index_metadata.as_retriever(
similarity_top_k=2
)
vector_retriever_metadata = vector_index_metadata.as_retriever( similarity_top_k=2 )
输入 [ ]
已复制!
retriever_metadata = RecursiveRetriever(
"vector",
retriever_dict={"vector": vector_retriever_metadata},
node_dict=all_nodes_dict,
verbose=True,
)
retriever_metadata = RecursiveRetriever( "vector", retriever_dict={"vector": vector_retriever_metadata}, node_dict=all_nodes_dict, verbose=True, )
输入 [ ]
已复制!
nodes = retriever_metadata.retrieve(
"Can you tell me about the key concepts for safety finetuning"
)
for node in nodes:
display_source_node(node, source_length=2000)
nodes = retriever_metadata.retrieve( "Can you tell me about the key concepts for safety finetuning" ) for node in nodes: display_source_node(node, source_length=2000)
输入 [ ]
已复制!
query_engine_metadata = RetrieverQueryEngine.from_args(
retriever_metadata, llm=llm
)
query_engine_metadata = RetrieverQueryEngine.from_args( retriever_metadata, llm=llm )
输入 [ ]
已复制!
response = query_engine_metadata.query(
"Can you tell me about the key concepts for safety finetuning"
)
print(str(response))
response = query_engine_metadata.query( "Can you tell me about the key concepts for safety finetuning" ) print(str(response))
评估¶
我们使用 Braintrust 评估了我们的递归检索 + 节点引用方法的有效性。Braintrust 是构建 AI 产品的企业级平台。从评估到提示词 playground 再到数据管理,它能将把 AI 融入业务过程中的不确定性和繁琐工作降至最低。
我们评估了块引用和元数据引用。我们使用嵌入相似度查找来检索引用节点。我们将这两种方法与直接获取原始节点的基准检索器进行了比较。在指标方面,我们使用命中率和 MRR 进行评估。
您可以在此处查看以下示例评估仪表板:
数据集生成¶
我们首先从文本块集合中生成一个问题数据集。
输入 [ ]
已复制!
from llama_index.core.evaluation import (
generate_question_context_pairs,
EmbeddingQAFinetuneDataset,
)
import nest_asyncio
nest_asyncio.apply()
from llama_index.core.evaluation import ( generate_question_context_pairs, EmbeddingQAFinetuneDataset, ) import nest_asyncio nest_asyncio.apply()
输入 [ ]
已复制!
eval_dataset = generate_question_context_pairs(base_nodes)
eval_dataset = generate_question_context_pairs(base_nodes)
输入 [ ]
已复制!
eval_dataset.save_json("data/llama2_eval_dataset.json")
eval_dataset.save_json("data/llama2_eval_dataset.json")
输入 [ ]
已复制!
# optional
eval_dataset = EmbeddingQAFinetuneDataset.from_json(
"data/llama2_eval_dataset.json"
)
# optional eval_dataset = EmbeddingQAFinetuneDataset.from_json( "data/llama2_eval_dataset.json" )
输入 [ ]
已复制!
import pandas as pd
# set vector retriever similarity top k to higher
top_k = 10
def display_results(names, results_arr):
    """Display mean hit-rate and MRR for each named retriever's eval results."""
    hit_rates, mrrs = [], []
    for _, eval_results in zip(names, results_arr):
        # one row of metrics per individual eval result
        metrics_df = pd.DataFrame(
            [res.metric_vals_dict for res in eval_results]
        )
        hit_rates.append(metrics_df["hit_rate"].mean())
        mrrs.append(metrics_df["mrr"].mean())
    # summary table: one row per retriever
    display(
        pd.DataFrame(
            {"retrievers": names, "hit_rate": hit_rates, "mrr": mrrs}
        )
    )
import pandas as pd # set vector retriever similarity top k to higher top_k = 10 def display_results(names, results_arr): """Display results from evaluate.""" hit_rates = [] mrrs = [] for name, eval_results in zip(names, results_arr): metric_dicts = [] for eval_result in eval_results: metric_dict = eval_result.metric_vals_dict metric_dicts.append(metric_dict) results_df = pd.DataFrame(metric_dicts) hit_rate = results_df["hit_rate"].mean() mrr = results_df["mrr"].mean() hit_rates.append(hit_rate) mrrs.append(mrr) final_df = pd.DataFrame( {"retrievers": names, "hit_rate": hit_rates, "mrr": mrrs} ) display(final_df)
让我们定义一些评分函数并定义我们的数据集数据变量。
输入 [ ]
已复制!
queries = eval_dataset.queries
relevant_docs = eval_dataset.relevant_docs
data = [
({"input": queries[query], "expected": relevant_docs[query]})
for query in queries.keys()
]
def hitRateScorer(input, expected, output=None):
    """Score 1 if any retrieved id in *output* is among *expected*, else 0."""
    return int(any(doc_id in expected for doc_id in output))
def mrrScorer(input, expected, output=None):
    """Return the reciprocal rank of the first relevant id in *output*, or 0."""
    return next(
        (
            1 / rank
            for rank, doc_id in enumerate(output, start=1)
            if doc_id in expected
        ),
        0,
    )
queries = eval_dataset.queries relevant_docs = eval_dataset.relevant_docs data = [ ({"input": queries[query], "expected": relevant_docs[query]}) for query in queries.keys() ] def hitRateScorer(input, expected, output=None): is_hit = any([id in expected for id in output]) return 1 if is_hit else 0 def mrrScorer(input, expected, output=None): for i, id in enumerate(output): if id in expected: return 1 / (i + 1) return 0
输入 [ ]
已复制!
import braintrust
# Evaluate the chunk retriever
vector_retriever_chunk = vector_index_chunk.as_retriever(similarity_top_k=10)
retriever_chunk = RecursiveRetriever(
"vector",
retriever_dict={"vector": vector_retriever_chunk},
node_dict=all_nodes_dict,
verbose=False,
)
def runChunkRetriever(input, hooks):
    """Braintrust task: return the node ids retrieved by the chunk retriever."""
    return [n.node.node_id for n in retriever_chunk.retrieve(input)]
chunkEval = await braintrust.Eval(
name="llamaindex-recurisve-retrievers",
data=data,
task=runChunkRetriever,
scores=[hitRateScorer, mrrScorer],
)
import braintrust # Evaluate the chunk retriever vector_retriever_chunk = vector_index_chunk.as_retriever(similarity_top_k=10) retriever_chunk = RecursiveRetriever( "vector", retriever_dict={"vector": vector_retriever_chunk}, node_dict=all_nodes_dict, verbose=False, ) def runChunkRetriever(input, hooks): retrieved_nodes = retriever_chunk.retrieve(input) retrieved_ids = [node.node.node_id for node in retrieved_nodes] return retrieved_ids chunkEval = await braintrust.Eval( name="llamaindex-recurisve-retrievers", data=data, task=runChunkRetriever, scores=[hitRateScorer, mrrScorer], )
输入 [ ]
已复制!
# Evaluate the metadata retriever
vector_retriever_metadata = vector_index_metadata.as_retriever(
similarity_top_k=10
)
retriever_metadata = RecursiveRetriever(
"vector",
retriever_dict={"vector": vector_retriever_metadata},
node_dict=all_nodes_dict,
verbose=False,
)
def runMetaDataRetriever(input, hooks):
    """Braintrust task: return the node ids retrieved by the metadata retriever."""
    return [n.node.node_id for n in retriever_metadata.retrieve(input)]
metadataEval = await braintrust.Eval(
name="llamaindex-recurisve-retrievers",
data=data,
task=runMetaDataRetriever,
scores=[hitRateScorer, mrrScorer],
)
# Evaluate the metadata retriever vector_retriever_metadata = vector_index_metadata.as_retriever( similarity_top_k=10 ) retriever_metadata = RecursiveRetriever( "vector", retriever_dict={"vector": vector_retriever_metadata}, node_dict=all_nodes_dict, verbose=False, ) def runMetaDataRetriever(input, hooks): retrieved_nodes = retriever_metadata.retrieve(input) retrieved_ids = [node.node.node_id for node in retrieved_nodes] return retrieved_ids metadataEval = await braintrust.Eval( name="llamaindex-recurisve-retrievers", data=data, task=runMetaDataRetriever, scores=[hitRateScorer, mrrScorer], )
输入 [ ]
已复制!
# Evaluate the base retriever
base_retriever = base_index.as_retriever(similarity_top_k=10)
def runBaseRetriever(input, hooks):
    """Braintrust task: return the node ids retrieved by the baseline retriever."""
    return [n.node.node_id for n in base_retriever.retrieve(input)]
baseEval = await braintrust.Eval(
name="llamaindex-recurisve-retrievers",
data=data,
task=runBaseRetriever,
scores=[hitRateScorer, mrrScorer],
)
# Evaluate the base retriever base_retriever = base_index.as_retriever(similarity_top_k=10) def runBaseRetriever(input, hooks): retrieved_nodes = base_retriever.retrieve(input) retrieved_ids = [node.node.node_id for node in retrieved_nodes] return retrieved_ids baseEval = await braintrust.Eval( name="llamaindex-recurisve-retrievers", data=data, task=runBaseRetriever, scores=[hitRateScorer, mrrScorer], )