本指南将向您展示如何对 RAG 进行超参数优化。
我们使用了新的实验性 ParamTuner 类,它允许对 RAG 函数进行超参数网格搜索。它有两种变体:
ParamTuner: 一种通过遍历所有参数进行参数调优的朴素方法。
RayTuneParamTuner: 由 Ray Tune 提供支持的超参数调优机制。
ParamTuner 可以接受任何输出字典的函数。在此设置中,我们定义一个函数,该函数从一组文档(Llama 2 论文)构建一个基本的 RAG 摄取流水线,在评估数据集上运行它,并衡量正确性指标。
我们研究调优以下参数:
- 块大小
- Top k 值
- 输入 [ ]
%pip install llama-index-llms-openai
%pip install llama-index-embeddings-openai
%pip install llama-index-readers-file pymupdf
%pip install llama-index-experimental-param-tuner
!pip install llama-index llama-hub
!mkdir data && wget --user-agent "Mozilla" "https://arxiv.org/pdf/2307.09288.pdf" -O "data/llama2.pdf"
--2023-11-04 00:16:34-- https://arxiv.org/pdf/2307.09288.pdf Resolving arxiv.org (arxiv.org)... 128.84.21.199 Connecting to arxiv.org (arxiv.org)|128.84.21.199|:443... connected. HTTP request sent, awaiting response... 200 OK Length: 13661300 (13M) [application/pdf] Saving to: ‘data/llama2.pdf’ data/llama2.pdf 100%[===================>] 13.03M 533KB/s in 36s 2023-11-04 00:17:10 (376 KB/s) - ‘data/llama2.pdf’ saved [13661300/13661300]
import nest_asyncio
# Patch asyncio so async code can run inside the notebook's already-running
# event loop (presumably needed for the async tuner cells — confirm).
nest_asyncio.apply()
from pathlib import Path
from llama_index.readers.file import PDFReader
from llama_index.readers.file import UnstructuredReader
from llama_index.readers.file import PyMuPDFReader
# Read the downloaded Llama 2 paper with PDFReader.
loader = PDFReader()
docs0 = loader.load_data(file=Path("./data/llama2.pdf"))
from llama_index.core import Document
# Join the content of every loaded document into one string (separated by
# blank lines) and wrap it in a single Document.
doc_text = "\n\n".join([d.get_content() for d in docs0])
docs = [Document(text=doc_text)]
from llama_index.core.node_parser import SimpleNodeParser
from llama_index.core.schema import IndexNode
在这里,我们为 Llama2 论文设置一个“黄金”评估数据集。
注意:我们从 Dropbox 中拉取此数据集。有关如何生成数据集的详细信息,请参阅我们的 DatasetGenerator 模块。
!wget "https://www.dropbox.com/scl/fi/fh9vsmmm8vu0j50l3ss38/llama2_eval_qr_dataset.json?rlkey=kkoaez7aqeb4z25gzc06ak6kb&dl=1" -O data/llama2_eval_qr_dataset.json
!wget "https://www.dropbox.com/scl/fi/fh9vsmmm8vu0j50l3ss38/llama2_eval_qr_dataset.json?rlkey=kkoaez7aqeb4z25gzc06ak6kb&dl=1" -O data/llama2_eval_qr_dataset.json
from llama_index.core.evaluation import QueryResponseDataset
# optional
# Load the query/response evaluation dataset from the downloaded JSON file.
eval_dataset = QueryResponseDataset.from_json(
"data/llama2_eval_qr_dataset.json"
)
# Questions that will be sent to the query engine.
eval_qs = eval_dataset.questions
# Keep only the second element (the reference response) of each qr pair.
ref_response_strs = [r for (_, r) in eval_dataset.qr_pairs]
在这里,我们定义了给定参数后需要优化的函数。
该函数具体执行以下操作:1) 从文档构建索引,2) 查询索引,并运行一些基本评估。
from llama_index.core import ( VectorStoreIndex, load_index_from_storage, StorageContext, ) from llama_index.experimental.param_tuner import ParamTuner from llama_index.core.param_tuner.base import TunedResult, RunResult from llama_index.core.evaluation.eval_utils import ( get_responses, aget_responses, ) from llama_index.core.evaluation import ( SemanticSimilarityEvaluator, BatchEvalRunner, ) from llama_index.llms.openai import OpenAI from llama_index.embeddings.openai import OpenAIEmbedding import os import numpy as np from pathlib import Path
from llama_index.core import (
VectorStoreIndex,
load_index_from_storage,
StorageContext,
)
from llama_index.experimental.param_tuner import ParamTuner
from llama_index.core.param_tuner.base import TunedResult, RunResult
from llama_index.core.evaluation.eval_utils import (
get_responses,
aget_responses,
)
from llama_index.core.evaluation import (
SemanticSimilarityEvaluator,
BatchEvalRunner,
)
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding
import os
import numpy as np
from pathlib import Path
def _build_index(chunk_size, docs): index_out_path = f"./storage_{chunk_size}" if not os.path.exists(index_out_path): Path(index_out_path).mkdir(parents=True, exist_ok=True) # parse docs node_parser = SimpleNodeParser.from_defaults(chunk_size=chunk_size) base_nodes = node_parser.get_nodes_from_documents(docs) # build index index = VectorStoreIndex(base_nodes) # save index to disk index.storage_context.persist(index_out_path) else: # rebuild storage context storage_context = StorageContext.from_defaults( persist_dir=index_out_path ) # load index index = load_index_from_storage( storage_context, ) return index def _get_eval_batch_runner(): evaluator_s = SemanticSimilarityEvaluator(embed_model=OpenAIEmbedding()) eval_batch_runner = BatchEvalRunner( {"semantic_similarity": evaluator_s}, workers=2, show_progress=True ) return eval_batch_runner
def _build_index(chunk_size, docs):
    """Build the vector index for ``chunk_size``, or load it from disk.

    The index is cached in ``./storage_{chunk_size}``. When that directory
    already exists the index is loaded from it; otherwise the documents are
    split with ``SimpleNodeParser``, indexed, and persisted there.

    Args:
        chunk_size: Chunk size passed to ``SimpleNodeParser.from_defaults``;
            also used to name the cache directory.
        docs: Documents passed to ``get_nodes_from_documents``.

    Returns:
        The newly built or reloaded index.
    """
    index_out_path = f"./storage_{chunk_size}"

    if os.path.exists(index_out_path):
        # rebuild storage context and load the cached index
        storage_context = StorageContext.from_defaults(
            persist_dir=index_out_path
        )
        return load_index_from_storage(storage_context)

    # parse docs
    node_parser = SimpleNodeParser.from_defaults(chunk_size=chunk_size)
    base_nodes = node_parser.get_nodes_from_documents(docs)
    # build index
    index = VectorStoreIndex(base_nodes)
    # Create the cache directory only after the index was built successfully:
    # creating it up front would leave an empty directory behind if the build
    # raised, and the next call would then try (and fail) to load from it.
    Path(index_out_path).mkdir(parents=True, exist_ok=True)
    # save index to disk
    index.storage_context.persist(index_out_path)
    return index
def _get_eval_batch_runner():
    """Return a ``BatchEvalRunner`` with one semantic-similarity evaluator.

    The evaluator is registered under the key ``"semantic_similarity"`` and
    uses an ``OpenAIEmbedding`` model; the runner uses 2 workers and shows
    progress.
    """
    evaluators = {
        "semantic_similarity": SemanticSimilarityEvaluator(
            embed_model=OpenAIEmbedding()
        )
    }
    return BatchEvalRunner(evaluators, workers=2, show_progress=True)
def objective_function(params_dict): chunk_size = params_dict["chunk_size"] docs = params_dict["docs"] top_k = params_dict["top_k"] eval_qs = params_dict["eval_qs"] ref_response_strs = params_dict["ref_response_strs"] # build index index = _build_index(chunk_size, docs) # query engine query_engine = index.as_query_engine(similarity_top_k=top_k) # get predicted responses pred_response_objs = get_responses( eval_qs, query_engine, show_progress=True ) # run evaluator # NOTE: can uncomment other evaluators eval_batch_runner = _get_eval_batch_runner() eval_results = eval_batch_runner.evaluate_responses( eval_qs, responses=pred_response_objs, reference=ref_response_strs ) # get semantic similarity metric mean_score = np.array( [r.score for r in eval_results["semantic_similarity"]] ).mean() return RunResult(score=mean_score, params=params_dict)
def objective_function(params_dict):
    """Score one parameter combination by mean semantic similarity.

    Builds (or loads) the index for the given chunk size, answers every
    evaluation question through a query engine, and evaluates the answers
    against the reference responses.

    Args:
        params_dict: Mapping with the keys ``"chunk_size"``, ``"docs"``,
            ``"top_k"``, ``"eval_qs"`` and ``"ref_response_strs"``.

    Returns:
        A ``RunResult`` whose score is the mean ``"semantic_similarity"``
        score and whose params is ``params_dict`` itself.
    """
    # Look the keys up in a fixed order so a missing key fails before any
    # index is built.
    chunk_size, docs, top_k, eval_qs, ref_response_strs = [
        params_dict[key]
        for key in ("chunk_size", "docs", "top_k", "eval_qs", "ref_response_strs")
    ]

    # index -> query engine retrieving the top_k most similar nodes
    query_engine = _build_index(chunk_size, docs).as_query_engine(
        similarity_top_k=top_k
    )

    # predicted responses for every evaluation question
    predictions = get_responses(eval_qs, query_engine, show_progress=True)

    # NOTE: other evaluators can be added in _get_eval_batch_runner
    runner = _get_eval_batch_runner()
    results_by_metric = runner.evaluate_responses(
        eval_qs, responses=predictions, reference=ref_response_strs
    )

    # average the semantic similarity metric over all questions
    similarity_scores = [
        result.score for result in results_by_metric["semantic_similarity"]
    ]
    return RunResult(
        score=np.array(similarity_scores).mean(), params=params_dict
    )
async def aobjective_function(params_dict): chunk_size = params_dict["chunk_size"] docs = params_dict["docs"] top_k = params_dict["top_k"] eval_qs = params_dict["eval_qs"] ref_response_strs = params_dict["ref_response_strs"] # build index index = _build_index(chunk_size, docs) # query engine query_engine = index.as_query_engine(similarity_top_k=top_k) # get predicted responses pred_response_objs = await aget_responses( eval_qs, query_engine, show_progress=True ) # run evaluator # NOTE: can uncomment other evaluators eval_batch_runner = _get_eval_batch_runner() eval_results = await eval_batch_runner.aevaluate_responses( eval_qs, responses=pred_response_objs, reference=ref_response_strs ) # get semantic similarity metric mean_score = np.array( [r.score for r in eval_results["semantic_similarity"]] ).mean() return RunResult(score=mean_score, params=params_dict)
async def aobjective_function(params_dict):
    """Async variant: score one parameter combination by mean similarity.

    Builds (or loads) the index for the given chunk size, awaits the answers
    to every evaluation question, and awaits their evaluation against the
    reference responses.

    Args:
        params_dict: Mapping with the keys ``"chunk_size"``, ``"docs"``,
            ``"top_k"``, ``"eval_qs"`` and ``"ref_response_strs"``.

    Returns:
        A ``RunResult`` whose score is the mean ``"semantic_similarity"``
        score and whose params is ``params_dict`` itself.
    """
    # Look the keys up in a fixed order so a missing key fails before any
    # index is built.
    chunk_size, docs, top_k, eval_qs, ref_response_strs = [
        params_dict[key]
        for key in ("chunk_size", "docs", "top_k", "eval_qs", "ref_response_strs")
    ]

    # index -> query engine retrieving the top_k most similar nodes
    query_engine = _build_index(chunk_size, docs).as_query_engine(
        similarity_top_k=top_k
    )

    # predicted responses for every evaluation question
    predictions = await aget_responses(
        eval_qs, query_engine, show_progress=True
    )

    # NOTE: other evaluators can be added in _get_eval_batch_runner
    runner = _get_eval_batch_runner()
    results_by_metric = await runner.aevaluate_responses(
        eval_qs, responses=predictions, reference=ref_response_strs
    )

    # average the semantic similarity metric over all questions
    similarity_scores = [
        result.score for result in results_by_metric["semantic_similarity"]
    ]
    return RunResult(
        score=np.array(similarity_scores).mean(), params=params_dict
    )
我们定义了要进行网格搜索的参数 param_dict 和固定参数 fixed_param_dict。
param_dict = {"chunk_size": [256, 512, 1024], "top_k": [1, 2, 5]} # param_dict = { # "chunk_size": [256], # "top_k": [1] # } fixed_param_dict = { "docs": docs, "eval_qs": eval_qs[:10], "ref_response_strs": ref_response_strs[:10], }
# Candidate values to search over for each tuned parameter.
param_dict = {"chunk_size": [256, 512, 1024], "top_k": [1, 2, 5]}
# Smaller single-combination grid, kept for quick test runs:
# param_dict = {
# "chunk_size": [256],
# "top_k": [1]
# }
# Values held constant across runs; only the first 10 questions and
# reference responses are used.
fixed_param_dict = {
"docs": docs,
"eval_qs": eval_qs[:10],
"ref_response_strs": ref_response_strs[:10],
}
在这里,我们运行默认的 ParamTuner,它以同步或异步方式遍历所有超参数组合。
from llama_index.experimental.param_tuner import ParamTuner
from llama_index.experimental.param_tuner import ParamTuner
# Synchronous tuner: calls objective_function with the tuned and fixed
# parameters.
param_tuner = ParamTuner(
param_fn=objective_function,
param_dict=param_dict,
fixed_param_dict=fixed_param_dict,
show_progress=True,
)
results = param_tuner.tune()
# Report the score and parameters of the best run.
best_result = results.best_run_result
best_top_k = results.best_run_result.params["top_k"]
best_chunk_size = results.best_run_result.params["chunk_size"]
print(f"Score: {best_result.score}")
print(f"Top-k: {best_top_k}")
print(f"Chunk size: {best_chunk_size}")
Score: 0.9490885841089257 Top-k: 2 Chunk size: 512
# adjust test_idx for additional testing
test_idx = 6
# Show the score, top_k and chunk_size of the run at position test_idx.
p = results.run_results[test_idx].params
(results.run_results[test_idx].score, p["top_k"], p["chunk_size"])
(0.9263373628377412, 1, 256)
运行异步版本。
from llama_index.experimental.param_tuner import AsyncParamTuner
from llama_index.experimental.param_tuner import AsyncParamTuner
# Async tuner: uses the coroutine aobjective_function, configured with
# 2 workers.
aparam_tuner = AsyncParamTuner(
aparam_fn=aobjective_function,
param_dict=param_dict,
fixed_param_dict=fixed_param_dict,
num_workers=2,
show_progress=True,
)
# Top-level await: this cell is meant to run in a notebook.
results = await aparam_tuner.atune()
# Report the score and parameters of the best run.
best_result = results.best_run_result
best_top_k = results.best_run_result.params["top_k"]
best_chunk_size = results.best_run_result.params["chunk_size"]
print(f"Score: {best_result.score}")
print(f"Top-k: {best_top_k}")
print(f"Chunk size: {best_chunk_size}")
Score: 0.9521222054806685 Top-k: 2 Chunk size: 512
在这里,我们运行由 Ray Tune 提供支持的调优器,Ray Tune 是一个用于可扩展超参数调优的库。
在 notebook 中,我们在本地运行它,但您也可以在集群上运行。
from llama_index.experimental.param_tuner import RayTuneParamTuner
from llama_index.experimental.param_tuner import RayTuneParamTuner
# Ray Tune-backed tuner using the same synchronous objective and grids.
# run_config_dict supplies a storage path and experiment name
# (presumably forwarded to Ray's run configuration — confirm).
param_tuner = RayTuneParamTuner(
param_fn=objective_function,
param_dict=param_dict,
fixed_param_dict=fixed_param_dict,
run_config_dict={"storage_path": "/tmp/custom/ray_tune", "name": "my_exp"},
)
results = param_tuner.tune()
results.best_run_result.params.keys()
dict_keys(['docs', 'eval_qs', 'ref_response_strs', 'chunk_size', 'top_k'])
results.best_idx
0
# Report the score and parameters of the best Ray Tune run.
best_result = results.best_run_result
best_top_k = results.best_run_result.params["top_k"]
best_chunk_size = results.best_run_result.params["chunk_size"]
print(f"Score: {best_result.score}")
print(f"Top-k: {best_top_k}")
print(f"Chunk size: {best_chunk_size}")
Score: 0.9486126773392092 Top-k: 2 Chunk size: 512