Activeloop Deep Memory¶
如何在文档问答中使 RAG 的命中率提高 15% 以上?
检索增强生成 (RAG) 最近获得了广泛关注。随着先进的 RAG 技术和 Agent 的出现,它们扩展了 RAG 能够实现的功能潜力。然而,一些挑战可能会限制 RAG 集成到生产环境中。在生产环境中实施 RAG 时需要考虑的主要因素是准确性(召回率)、成本和延迟。对于基本用例,将 OpenAI 的 Ada 模型与朴素相似性搜索结合可以产生令人满意的结果。但是,为了在搜索期间获得更高的准确性或召回率,可能需要采用高级检索技术。这些方法可能包括改变数据块大小、多次重写查询等,这可能会增加延迟和成本。Activeloop 的 Deep Memory,作为 Activeloop Deep Lake 用户可用的一个功能,通过引入一个微小的神经网络层来解决这些问题,该网络层经过训练以匹配用户查询与语料库中的相关数据。虽然这一附加层在搜索过程中产生的延迟极小,但它最多可将检索准确性提高 27%,并且保持成本效益和易于使用,无需任何其他高级 RAG 技术。
%pip install llama-index-vector-stores-deeplake
%pip install llama-index-llms-openai
import nest_asyncio
import os
import getpass
# Patch asyncio via nest_asyncio before any async work in this notebook.
# NOTE(review): presumably needed because the notebook already runs an event loop — confirm against the nest_asyncio docs.
nest_asyncio.apply()
!pip install deeplake beautifulsoup4 html2text tiktoken openai llama-index python-dotenv
本教程将解析 deeplake 文档,并创建一个 RAG 系统,该系统可以回答文档中的问题。
本教程可分为以下几个部分:1. 数据集创建和摄取;2. 训练 Deep Memory;3. DeepMemory 评估;4. Deep Memory 推理。
1. 数据集创建和摄取¶
让我们使用 BeautifulSoup 解析所有链接,并将它们转换为 LlamaIndex 文档。
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
def get_all_links(url, timeout=30):
    """Collect the absolute URLs of every hyperlink found on a page.

    Args:
        url: Address of the page to download and scan.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A list of absolute URLs, one per ``<a>`` tag with a non-empty
        ``href``, in document order (duplicates are kept). An empty list
        is returned when the page cannot be retrieved.
    """
    try:
        # Without a timeout the request may block forever on a stalled server.
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        # Treat transport failures like a bad status code: report and move on.
        print(f"Failed to retrieve the page: {url} ({exc})")
        return []

    if response.status_code != 200:
        print(f"Failed to retrieve the page: {url}")
        return []

    soup = BeautifulSoup(response.content, "html.parser")

    # Anchor tags carry the links; urljoin resolves relative hrefs against `url`.
    return [
        urljoin(url, anchor["href"])
        for anchor in soup.find_all("a", href=True)
        if anchor["href"]
    ]
from langchain.document_loaders import AsyncHtmlLoader
from langchain.document_transformers import Html2TextTransformer
from llama_index.core import Document
def load_documents(url):
    """Load every page linked from `url` as a LlamaIndex document.

    Args:
        url: Page whose hyperlinks are collected with `get_all_links`.

    Returns:
        A list of LlamaIndex `Document` objects, one per loaded page.
    """
    # Fetch the raw HTML of each linked page.
    html_pages = AsyncHtmlLoader(get_all_links(url)).load()
    # Reduce the HTML to plain text.
    text_pages = Html2TextTransformer().transform_documents(html_pages)
    # Convert from LangChain's document type to LlamaIndex's.
    return [Document.from_langchain_format(page) for page in text_pages]
# Load every page linked from the Deep Lake documentation landing page.
docs = load_documents("https://docs.deeplake.ai/en/stable/")
Fetching pages: 100%|##########| 120/120 [00:13<00:00, 8.70it/s]
len(docs)
120
from llama_index.core.evaluation import generate_question_context_pairs
from llama_index.core import (
VectorStoreIndex,
SimpleDirectoryReader,
StorageContext,
)
from llama_index.vector_stores.deeplake import DeepLakeVectorStore
from llama_index.core.node_parser import SimpleNodeParser
from llama_index.llms.openai import OpenAI
# Prompt for the OpenAI key interactively so it is not stored in the notebook.
os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter your OpenAI API token: ")
# An Activeloop token is needed if you are not signed in using the CLI: `activeloop login -u <USERNAME> -p <PASSWORD>`
os.environ["ACTIVELOOP_TOKEN"] = getpass.getpass(
"Enter your ActiveLoop API token: "
) # Get your API token from https://app.activeloop.ai, click on your profile picture in the top right corner, and select "API Tokens"
token = os.getenv("ACTIVELOOP_TOKEN")
# Vector store backed by the Deep Lake dataset at `dataset_path`.
# NOTE(review): `runtime={"tensor_db": True}` presumably selects the managed runtime that Deep Memory requires — confirm against the Deep Lake docs.
vector_store = DeepLakeVectorStore(
dataset_path="hub://activeloop-test/deeplake_docs_deepmemory2",
overwrite=False, # set to True to overwrite the existing dataset
runtime={"tensor_db": True},
token=token,
)
Deep Lake Dataset in hub://activeloop-test/deeplake_docs_deepmemory2 already exists, loading from the storage
def create_modules(vector_store, docs=None, populate_vector_store=True):
    """Build the storage context, document nodes and LLM used by the tutorial.

    Args:
        vector_store: Vector store the storage context is built around.
        docs: Documents to split into nodes. Defaults to no documents.
        populate_vector_store: When False, no nodes are produced, so building
            an index from the result does not add anything to the store.

    Returns:
        A `(storage_context, nodes, llm)` tuple.
    """
    # `None` replaces the former mutable `[]` default, which is shared between calls.
    documents = [] if docs is None else docs

    if populate_vector_store:
        node_parser = SimpleNodeParser.from_defaults(chunk_size=512)
        nodes = node_parser.get_nodes_from_documents(documents)
    else:
        nodes = []

    # By default the node ids are random uuids. To get the same ids on every
    # run, assign them from the node's position instead.
    for idx, node in enumerate(nodes):
        node.id_ = f"node_{idx}"

    llm = OpenAI(model="gpt-4")
    storage_context = StorageContext.from_defaults(vector_store=vector_store)
    return storage_context, nodes, llm
# Build the storage context, the chunked nodes and the LLM used below.
storage_context, nodes, llm = create_modules(
    docs=docs,
    vector_store=vector_store,
    # populate_vector_store=False,  # uncomment this line to skip populating the vector store
)

# Indexing the nodes embeds them and writes them to the vector store.
vector_index = VectorStoreIndex(nodes, storage_context=storage_context)

# Deep Lake options are passed via `vector_store_kwargs`, the same way the
# evaluation and query-engine cells of this notebook do; a bare
# `deep_memory=True` keyword is not forwarded to the vector store.
deep_memory_retriever = vector_index.as_retriever(
    similarity_top_k=4, vector_store_kwargs={"deep_memory": True}
)
2. 训练 Deep Memory¶
上面我们展示了 deep_memory 的总体工作原理示意图。因此,正如您所看到的,为了训练它,您需要相关性数据、查询以及语料库数据(即我们想要查询的数据)。语料库数据已在上一节中填充;在这里,我们将生成问题和相关性数据。
- questions:一个字符串列表,其中每个字符串表示一个查询。
- relevance:包含每个问题的真实标签(ground truth)链接。可能存在多个文档包含给定问题的答案。因此,相关性的类型是 List[List[tuple[str, float]]],其中外层列表对应各个查询,内层列表对应相关文档。每个元组包含一个字符串和一个浮点数,其中字符串表示源文档的 id(对应于数据集中的 id 张量),而浮点数表示当前文档与问题的相关程度。
from llama_index.core.evaluation import (
generate_question_context_pairs,
EmbeddingQAFinetuneDataset,
)
import random
def create_train_test_datasets(
    number_of_samples=600, llm=None, nodes=None, save=False
):
    """Sample nodes and generate train/test question datasets from them.

    Args:
        number_of_samples: How many nodes to draw at random, without
            replacement.
        llm: Language model handed to `generate_question_context_pairs`.
        nodes: Nodes to sample from.
        save: When True, write both datasets to JSON files in the current
            directory.

    Returns:
        A `(train_qa_dataset, test_qa_dataset)` tuple built from an 80/20
        split of the sampled nodes.
    """
    picked = random.sample(range(len(nodes)), number_of_samples)
    sampled_nodes = [nodes[position] for position in picked]

    # The first 80% of the sample is used for training, the rest for testing.
    split_at = int(len(picked) * 0.8)
    train_nodes = sampled_nodes[:split_at]
    test_nodes = sampled_nodes[split_at:]

    # One generated question per node, training split first.
    train_qa_dataset, test_qa_dataset = (
        generate_question_context_pairs(
            subset, llm=llm, num_questions_per_chunk=1
        )
        for subset in (train_nodes, test_nodes)
    )

    if not save:
        return train_qa_dataset, test_qa_dataset

    # [optional] persist both splits so they can be reloaded later
    train_qa_dataset.save_json(
        f"deeplake_docs_{number_of_samples}_train.json"
    )
    test_qa_dataset.save_json(
        f"deeplake_docs_{number_of_samples}_test.json"
    )
    return train_qa_dataset, test_qa_dataset
# Generate the train/test question sets from 600 sampled nodes and save them as JSON.
train_qa_dataset, test_qa_dataset = create_train_test_datasets(
number_of_samples=600, llm=llm, nodes=nodes, save=True
)
4%|▍ | 19/480 [02:25<1:04:00, 8.33s/it]
# Reload the question sets from the JSON files written when the datasets were created with save=True.
train_qa_dataset = EmbeddingQAFinetuneDataset.from_json(
"deeplake_docs_600_train.json"
)
test_qa_dataset = EmbeddingQAFinetuneDataset.from_json(
"deeplake_docs_600_test.json"
)
def create_query_relevance(qa_dataset):
    """Convert a llama-index QA dataset to the format used for Deep Memory training.

    Args:
        qa_dataset: Object exposing `queries` (query id -> query text) and
            `relevant_docs` (query id -> list of relevant document ids).

    Returns:
        A `(queries, relevance)` tuple of equally long lists. `queries[i]`
        is the text of the i-th query and `relevance[i]` is a list of
        `(document_id, 1)` tuples, one per document relevant to that query.

    Raises:
        KeyError: If a query has no entry in `relevant_docs`.
    """
    relevant_docs = qa_dataset.relevant_docs
    queries = []
    relevance = []
    # Look relevance up by query id so both lists stay aligned even when the
    # two mappings are ordered differently.
    for query_id, text in qa_dataset.queries.items():
        queries.append(text)
        # Keep every relevant document, not just the first one.
        relevance.append([(doc_id, 1) for doc_id in relevant_docs[query_id]])
    return queries, relevance
# Convert both question sets into the (queries, relevance) lists used for Deep Memory training and evaluation.
train_queries, train_relevance = create_query_relevance(train_qa_dataset)
test_queries, test_relevance = create_query_relevance(test_qa_dataset)
train_queries[:3]
['In the context of creating a bounding box tensor in a dataset, explain the significance of the "coords" argument and its keys "type" and "mode". What does the "type" key specify about the bounding box coordinates?', 'Explain the process of creating an intrinsics tensor and appending intrinsics matrices in the context of computer vision. What are the dimensions of the intrinsics parameters and what do they represent? Also, describe the concept of a Segmentation Mask Htype and its role in image processing.', 'In the context of querying for images in the MNIST Train Dataset using `ds.query`, what does the command "select * where labels == 0" signify and what is the expected output?']
train_relevance[:3]
[[('node_788', 1)], [('node_861', 1)], [('node_82', 1)]]
test_queries[:3]
['What are the steps to update the information of keypoints and connections in a tensor, and what types of data can be appended to keypoints?', 'What is the command to create a mesh tensor in DeepLake and what are the supported compressions? Also, explain how to append a ply file containing mesh data to this tensor.', 'What is a Sequence htype in the context of tensors and how does it function as a wrapper for other htypes? Provide examples.']
test_relevance[:3]
[[('node_933', 1)], [('node_671', 1)], [('node_471', 1)]]
from langchain.embeddings.openai import OpenAIEmbeddings
# Embedding model whose `embed_documents` method is handed to Deep Memory as the embedding function.
embeddings = OpenAIEmbeddings()
# Start a Deep Memory training job from the training queries and their relevance labels.
# The returned job id is what the status call below takes.
job_id = vector_store.vectorstore.deep_memory.train(
queries=train_queries,
relevance=train_relevance,
embedding_function=embeddings.embed_documents,
)
Starting DeepMemory training job Your Deep Lake dataset has been successfully created!
Preparing training data for deepmemory:
Creating 483 embeddings in 1 batches of size 483:: 100%|██████████| 1/1 [00:03<00:00, 3.67s/it]
DeepMemory training job started. Job ID: 65421a5003888c9ca36c72e8
# Check the state of the training job identified by `job_id`.
vector_store.vectorstore.deep_memory.status(job_id)
This dataset can be visualized in Jupyter Notebook by ds.visualize() or at https://app.activeloop.ai/adilkhan/deeplake_docs_deepmemory2 -------------------------------------------------------------- | 65421a5003888c9ca36c72e8 | -------------------------------------------------------------- | status | completed | -------------------------------------------------------------- | progress | eta: 12.2 seconds | | | recall@10: 67.01% (+18.56%) | -------------------------------------------------------------- | results | recall@10: 67.01% (+18.56%) | --------------------------------------------------------------
3. DeepMemory 评估¶
太棒了!训练带来了显著的改进!现在,让我们评估它在测试集上的表现。
# Evaluate retrieval on the held-out test queries.
# NOTE(review): the recorded output reports recall@k both without and with Deep Memory — confirm the exact structure of `recalls`.
recalls = vector_store.vectorstore.deep_memory.evaluate(
queries=test_queries,
relevance=test_relevance,
embedding_function=embeddings.embed_documents,
)
info Wed Nov 1 09:32:44 2023 GMT Added distance metric `deepmemory_distance`. Embedding queries took 0.95 seconds ---- Evaluating without Deep Memory ---- Recall@1: 12.5% Recall@3: 23.3% Recall@5: 30.8% Recall@10: 50.8% Recall@50: 94.2% Recall@100: 95.8% ---- Evaluating with Deep Memory ---- Recall@1: 11.7% Recall@3: 27.5% Recall@5: 40.8% Recall@10: 65.0% Recall@50: 96.7% Recall@100: 98.3%
令人印象深刻!我们在测试集上观察到召回率提高了 15%。接下来,让我们使用 RetrieverEvaluator 检查 MRR(平均倒数排名)和命中率。
import pandas as pd
def display_results(eval_results):
    """Summarise retriever evaluation results as one row per retriever.

    Args:
        eval_results: Mapping from a retriever label to an iterable of
            evaluation results, each exposing a `metric_vals_dict` with
            "hit_rate" and "mrr" entries.

    Returns:
        A DataFrame with the columns "retrievers", "hit_rate" and "mrr",
        holding the mean hit rate and mean MRR of each retriever in the
        order the mapping yields them. The frame is empty (but keeps its
        columns) when `eval_results` is empty.
    """
    rows = []
    for name, eval_result in eval_results.items():
        full_df = pd.DataFrame([er.metric_vals_dict for er in eval_result])
        rows.append(
            {
                "retrievers": name,
                "hit_rate": full_df["hit_rate"].mean(),
                "mrr": full_df["mrr"].mean(),
            }
        )

    # One row per evaluated retriever, however many there are; the explicit
    # column list keeps the layout stable even when there are no rows.
    return pd.DataFrame(rows, columns=["retrievers", "hit_rate", "mrr"])
评估使用 deep memory 的检索性能
from llama_index.core.evaluation import RetrieverEvaluator
deep_memory_retriever = vector_index.as_retriever(
similarity_top_k=10, vector_store_kwargs={"deep_memory": True}
)
dm_retriever_evaluator = RetrieverEvaluator.from_metric_names(
["mrr", "hit_rate"], retriever=deep_memory_retriever
)
dm_eval_results = await dm_retriever_evaluator.aevaluate_dataset(
test_qa_dataset, retriever=dm_retriever_evaluator
)
from llama_index.core.evaluation import RetrieverEvaluator
# Baseline: the same top-10 retrieval, created without any Deep Memory option.
naive_retriever = vector_index.as_retriever(similarity_top_k=10)
naive_retriever_evaluator = RetrieverEvaluator.from_metric_names(
["mrr", "hit_rate"], retriever=naive_retriever
)
# Score the baseline on the same test questions with the same metrics (MRR and hit rate).
naive_eval_results = await naive_retriever_evaluator.aevaluate_dataset(
test_qa_dataset, retriever=naive_retriever
)
# Label each result set. `mode` already supplies "with"/"without", so the
# template must not repeat "with" (it used to yield "with with Deep Memory ...").
eval_results = {
    f"{mode} Deep Memory top-10 eval": eval_result
    for mode, eval_result in zip(
        ["with", "without"], [dm_eval_results, naive_eval_results]
    )
}
display_results(eval_results)
retrievers | hit_rate | mrr | |
---|---|---|---|
0 | with with Deep Memory top-10 eval | 0.650000 | 0.244775 |
1 | without with Deep Memory top-10 eval | 0.508333 | 0.215129 |
不仅命中率提高了,MRR 也提高了
4. Deep Memory 推理¶
# Query engine that asks the vector store to use Deep Memory during retrieval.
query_engine = vector_index.as_query_engine(
vector_store_kwargs={"deep_memory": True}, llm=llm
)
# Ask a question about the indexed documentation and print the generated answer.
response = query_engine.query(
"How can you connect your own storage to the deeplake?"
)
print(response)
info Wed Nov 1 11:37:33 2023 GMT Can't find any metric in the dataset. You can connect your own storage to deeplake by using the `connect()` function in the deeplake API.
# Same query engine with Deep Memory explicitly switched off, for comparison.
query_engine = vector_index.as_query_engine(
vector_store_kwargs={"deep_memory": False}, llm=llm
)
# Ask the identical question so the two answers can be compared directly.
response = query_engine.query(
"How can you connect your own storage to the deeplake?"
)
print(response)
The context does not provide information on how to connect your own storage to Deep Lake.
根据我们的观察,如果没有“deep memory”,我们的模型往往会产生不准确的结果,因为它检索到了错误的上下文。