Setup¶
We load in some data and define a very simple RAG query engine that we will evaluate (it uses top-k retrieval).
%pip install llama-index-readers-file pymupdf
%pip install llama-index-llms-openai
!mkdir data
!wget --user-agent "Mozilla" "https://arxiv.org/pdf/2307.09288.pdf" -O "data/llama2.pdf"
mkdir: data: File exists
--2023-09-19 00:05:14--  https://arxiv.org/pdf/2307.09288.pdf
Resolving arxiv.org (arxiv.org)... 128.84.21.199
Connecting to arxiv.org (arxiv.org)|128.84.21.199|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 13661300 (13M) [application/pdf]
Saving to: ‘data/llama2.pdf’

data/llama2.pdf     100%[===================>]  13.03M  1.56MB/s    in 9.3s

2023-09-19 00:05:25 (1.40 MB/s) - ‘data/llama2.pdf’ saved [13661300/13661300]
from pathlib import Path
from llama_index.readers.file import PyMuPDFReader
loader = PyMuPDFReader()
documents = loader.load(file_path="./data/llama2.pdf")
from llama_index.core import VectorStoreIndex
from llama_index.core.node_parser import SentenceSplitter
from llama_index.llms.openai import OpenAI
llm = OpenAI(model="gpt-4")
node_parser = SentenceSplitter(chunk_size=1024)
nodes = node_parser.get_nodes_from_documents(documents)
index = VectorStoreIndex(nodes)
query_engine = index.as_query_engine(llm=llm)
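Before generating the dataset, it can help to sanity-check the engine with a single query (the question below is just an illustrative example, not part of the dataset we build later):

# quick sanity check that retrieval + synthesis work end to end
test_response = query_engine.query("What is Llama 2-Chat optimized for?")
print(str(test_response))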
Dataset Generation¶
We first go through an exercise of generating a synthetic evaluation dataset. We do this by synthetically generating a set of questions from existing context, and then running each question (with the existing context) through a powerful LLM (e.g. GPT-4) to generate a "ground truth" response.
Define Functions¶
We define the functions that we will use for dataset generation:
from llama_index.core.schema import BaseNode
from llama_index.llms.openai import OpenAI
from llama_index.core.llms import ChatMessage, MessageRole
from llama_index.core import ChatPromptTemplate, PromptTemplate
from typing import Tuple, List
import re
llm = OpenAI(model="gpt-4")
We define generate_answers_for_questions to generate answers for a set of questions given a context.
QA_PROMPT = PromptTemplate(
    "Context information is below.\n"
    "---------------------\n"
    "{context_str}\n"
    "---------------------\n"
    "Given the context information and not prior knowledge, "
    "answer the query.\n"
    "Query: {query_str}\n"
    "Answer: "
)


def generate_answers_for_questions(
    questions: List[str], context: str, llm: OpenAI
) -> List[str]:
    """Generate answers for questions given context."""
    answers = []
    for question in questions:
        fmt_qa_prompt = QA_PROMPT.format(
            context_str=context, query_str=question
        )
        response_obj = llm.complete(fmt_qa_prompt)
        answers.append(str(response_obj))
    return answers
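As a quick illustration, the helper can be exercised on a single chunk with a couple of hand-written questions before wiring it into the full pipeline (the questions here are just examples):

# try the helper on the first node with two example questions
sample_context = nodes[0].get_content(metadata_mode="all")
sample_questions = [
    "What models does the document introduce?",
    "What use case are the fine-tuned models optimized for?",
]
print(generate_answers_for_questions(sample_questions, sample_context, llm))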
We define generate_qa_pairs to generate question/answer pairs over an entire list of nodes.
QUESTION_GEN_USER_TMPL = (
    "Context information is below.\n"
    "---------------------\n"
    "{context_str}\n"
    "---------------------\n"
    "Given the context information and not prior knowledge, "
    "generate the relevant questions. "
)

QUESTION_GEN_SYS_TMPL = """\
You are a Teacher/ Professor. Your task is to setup \
{num_questions_per_chunk} questions for an upcoming \
quiz/examination. The questions should be diverse in nature \
across the document. Restrict the questions to the \
context information provided.\
"""

question_gen_template = ChatPromptTemplate(
    message_templates=[
        ChatMessage(role=MessageRole.SYSTEM, content=QUESTION_GEN_SYS_TMPL),
        ChatMessage(role=MessageRole.USER, content=QUESTION_GEN_USER_TMPL),
    ]
)


def generate_qa_pairs(
    nodes: List[BaseNode], llm: OpenAI, num_questions_per_chunk: int = 10
) -> List[Tuple[str, str]]:
    """Generate question/answer pairs for each node."""
    qa_pairs = []
    for idx, node in enumerate(nodes):
        print(f"Node {idx}/{len(nodes)}")
        context_str = node.get_content(metadata_mode="all")
        fmt_messages = question_gen_template.format_messages(
            num_questions_per_chunk=num_questions_per_chunk,
            context_str=context_str,
        )
        chat_response = llm.chat(fmt_messages)
        raw_output = chat_response.message.content
        result_list = str(raw_output).strip().split("\n")
        cleaned_questions = [
            re.sub(r"^\d+[\).\s]", "", question).strip()
            for question in result_list
        ]
        answers = generate_answers_for_questions(
            cleaned_questions, context_str, llm
        )
        cur_qa_pairs = list(zip(cleaned_questions, answers))
        qa_pairs.extend(cur_qa_pairs)
    return qa_pairs
qa_pairs
[('What is the main focus of the work described in the document?', 'The main focus of the work described in the document is the development and release of Llama 2, a collection of pretrained and fine-tuned large language models (LLMs) ranging in scale from 7 billion to 70 billion parameters. The fine-tuned LLMs, called Llama 2-Chat, are optimized for dialogue use cases. The document also provides a detailed description of the approach to fine-tuning and safety improvements of Llama 2-Chat.'), ('What is the range of parameters for the large language models (LLMs) developed in this work?', 'The range of parameters for the large language models (LLMs) developed in this work is from 7 billion to 70 billion.'), ('What is the specific name given to the fine-tuned LLMs optimized for dialogue use cases?', 'The specific name given to the fine-tuned LLMs optimized for dialogue use cases is Llama 2-Chat.'), ('How do the models developed in this work compare to open-source chat models based on the benchmarks tested?', 'The models developed in this work, specifically the fine-tuned LLMs called Llama 2-Chat, outperform open-source chat models on most benchmarks tested.'), ('What are the two key areas of human evaluation mentioned in the document for the developed models?', 'The two key areas of human evaluation mentioned in the document for the developed models are helpfulness and safety.'), ('What is the purpose of providing a detailed description of the approach to fine-tuning and safety improvements of Llama 2-Chat?', 'The purpose of providing a detailed description of the approach to fine-tuning and safety improvements of Llama 2-Chat is to enable the community to build on their work and contribute to the responsible development of Large Language Models (LLMs).'), ('What is the intended benefit for the community from this work?', 'The intended benefit for the community from this work is to enable them to build on the work and contribute to the responsible development of large language models (LLMs). The team provides a detailed description of their approach to fine-tuning and safety improvements of Llama 2-Chat for this purpose.'), ('Who are the corresponding authors of this work and how can they be contacted?', 'The corresponding authors of this work are Thomas Scialom and Hugo Touvron. They can be contacted via email at [email protected] and [email protected] respectively.'), ('What is the source of the document and how many pages does it contain?', 'The source of the document is "1" and it contains 77 pages.'), ('Where can the contributions of all the authors be found in the document?', 'The contributions of all the authors can be found in Section A.1 of the document.')]
Getting Pairs over the Dataset¶
NOTE: this can take a long time. To speed it up, try feeding in a subset of the nodes.
qa_pairs = generate_qa_pairs(
    # nodes[:1],
    nodes,
    llm,
    num_questions_per_chunk=10,
)
[Optional] Define save/load¶
# save
import pickle
pickle.dump(qa_pairs, open("eval_dataset.pkl", "wb"))
# load
import pickle
qa_pairs = pickle.load(open("eval_dataset.pkl", "rb"))
Evaluating Generation¶
In this section we walk through a few methods for evaluating the generated results. At a high level, we use an "evaluation LLM" to measure the quality of the generated results. We do this in both the with-labels and label-free settings.
We go through the following evaluation algorithms:
- Correctness: compares the generated answer against the ground-truth answer.
- Faithfulness: evaluates whether a response is faithful to the contexts (label-free).
from llama_index.core.llms import ChatMessage, MessageRole
from llama_index.core import ChatPromptTemplate, PromptTemplate
from typing import Dict
CORRECTNESS_SYS_TMPL = """
You are an expert evaluation system for a question answering chatbot.
You are given the following information:
- a user query,
- a reference answer, and
- a generated answer.
Your job is to judge the relevance and correctness of the generated answer.
Output a single score that represents a holistic evaluation.
You must return your response in a line with only the score.
Do not return answers in any other format.
On a separate line provide your reasoning for the score as well.
Follow these guidelines for scoring:
- Your score has to be between 1 and 5, where 1 is the worst and 5 is the best.
- If the generated answer is not relevant to the user query, \
you should give a score of 1.
- If the generated answer is relevant but contains mistakes, \
you should give a score between 2 and 3.
- If the generated answer is relevant and fully correct, \
you should give a score between 4 and 5.
"""
CORRECTNESS_USER_TMPL = """
## User Query
{query}
## Reference Answer
{reference_answer}
## Generated Answer
{generated_answer}
"""
eval_chat_template = ChatPromptTemplate(
    message_templates=[
        ChatMessage(role=MessageRole.SYSTEM, content=CORRECTNESS_SYS_TMPL),
        ChatMessage(role=MessageRole.USER, content=CORRECTNESS_USER_TMPL),
    ]
)
Now that we've defined the prompt templates, let's define an evaluation function that feeds the prompt to the LLM and parses the output into a dict of results.
from llama_index.llms.openai import OpenAI
def run_correctness_eval(
    query_str: str,
    reference_answer: str,
    generated_answer: str,
    llm: OpenAI,
    threshold: float = 4.0,
) -> Dict:
    """Run correctness eval."""
    fmt_messages = eval_chat_template.format_messages(
        query=query_str,
        reference_answer=reference_answer,
        generated_answer=generated_answer,
    )
    chat_response = llm.chat(fmt_messages)
    raw_output = chat_response.message.content
    # extract the score (first line) and the reasoning (remaining lines)
    score_str, reasoning_str = raw_output.split("\n", 1)
    score = float(score_str)
    reasoning = reasoning_str.lstrip("\n")
    return {"passing": score >= threshold, "score": score, "reason": reasoning}
Now let's try running this on some sample inputs with a chat model (GPT-4).
llm = OpenAI(model="gpt-4")
# query_str = "What is the range of parameters for the large language models (LLMs) developed in this work?"
# reference_answer = "The range of parameters for the large language models (LLMs) developed in this work is from 7 billion to 70 billion."
query_str = (
    "What is the specific name given to the fine-tuned LLMs optimized for"
    " dialogue use cases?"
)
reference_answer = (
    "The specific name given to the fine-tuned LLMs optimized for dialogue use"
    " cases is Llama 2-Chat."
)
generated_answer = str(query_engine.query(query_str))
print(str(generated_answer))
The fine-tuned Large Language Models (LLMs) optimized for dialogue use cases are specifically called Llama 2-Chat.
eval_results = run_correctness_eval(
    query_str, reference_answer, generated_answer, llm=llm, threshold=4.0
)
display(eval_results)
display(eval_results)
{'passing': True, 'score': 5.0, 'reason': 'The generated answer is completely relevant to the user query and matches the reference answer in terms of information. It correctly identifies "Llama 2-Chat" as the specific name given to the fine-tuned LLMs optimized for dialogue use cases.'}
Building a Faithfulness Evaluator¶
The faithfulness evaluator evaluates whether the response is faithful to any of the retrieved contexts.
This is a step up in complexity from the correctness evaluator. Since the set of contexts can be quite long, they might overflow the context window, so we need a response synthesis strategy that iterates over the contexts in sequence.
We have a corresponding tutorial showing how to build response synthesis from scratch, as well as out-of-the-box response synthesis modules. In this guide we use the out-of-the-box modules.
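Conceptually, "create and refine" answers against the first context chunk and then revisits that answer for each subsequent chunk. A minimal from-scratch sketch of that loop (independent of the out-of-the-box Refine module used below; the refine prompt here is a simplified stand-in, not the library's template) might look like this:

def create_and_refine(query_str: str, contexts: List[str], llm: OpenAI) -> str:
    """Sketch of a sequential create-and-refine loop over contexts."""
    answer = None
    for context in contexts:
        if answer is None:
            # first chunk: answer from scratch using the QA prompt defined earlier
            prompt = QA_PROMPT.format(context_str=context, query_str=query_str)
        else:
            # later chunks: ask the LLM to refine the existing answer (simplified prompt)
            prompt = (
                f"Query: {query_str}\n"
                f"Existing answer: {answer}\n"
                f"New context: {context}\n"
                "Refine the existing answer if the new context is relevant; "
                "otherwise repeat the existing answer.\n"
            )
        answer = str(llm.complete(prompt))
    return answer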
EVAL_TEMPLATE = PromptTemplate(
    "Please tell if a given piece of information "
    "is supported by the context.\n"
    "You need to answer with either YES or NO.\n"
    "Answer YES if any of the context supports the information, even "
    "if most of the context is unrelated. "
    "Some examples are provided below. \n\n"
    "Information: Apple pie is generally double-crusted.\n"
    "Context: An apple pie is a fruit pie in which the principal filling "
    "ingredient is apples. \n"
    "Apple pie is often served with whipped cream, ice cream "
    "('apple pie à la mode'), custard or cheddar cheese.\n"
    "It is generally double-crusted, with pastry both above "
    "and below the filling; the upper crust may be solid or "
    "latticed (woven of crosswise strips).\n"
    "Answer: YES\n"
    "Information: Apple pies tastes bad.\n"
    "Context: An apple pie is a fruit pie in which the principal filling "
    "ingredient is apples. \n"
    "Apple pie is often served with whipped cream, ice cream "
    "('apple pie à la mode'), custard or cheddar cheese.\n"
    "It is generally double-crusted, with pastry both above "
    "and below the filling; the upper crust may be solid or "
    "latticed (woven of crosswise strips).\n"
    "Answer: NO\n"
    "Information: {query_str}\n"
    "Context: {context_str}\n"
    "Answer: "
)

EVAL_REFINE_TEMPLATE = PromptTemplate(
    "We want to understand if the following information is present "
    "in the context information: {query_str}\n"
    "We have provided an existing YES/NO answer: {existing_answer}\n"
    "We have the opportunity to refine the existing answer "
    "(only if needed) with some more context below.\n"
    "------------\n"
    "{context_msg}\n"
    "------------\n"
    "If the existing answer was already YES, still answer YES. "
    "If the information is present in the new context, answer YES. "
    "Otherwise answer NO.\n"
)
NOTE: in the current response synthesizer setup we don't separate out a system message and user message for the chat endpoints, so we just use our standard llm.complete for text completion.
We now define our function below. Since we defined both a standard eval template for a given piece of context and a refine template for subsequent contexts, we implement our "create and refine" response synthesis strategy to obtain the answer.
from llama_index.core.response_synthesizers import Refine
from typing import List, Dict
def run_faithfulness_eval(
    generated_answer: str,
    contexts: List[str],
    llm: OpenAI,
) -> Dict:
    """Run faithfulness eval."""
    refine = Refine(
        llm=llm,
        text_qa_template=EVAL_TEMPLATE,
        refine_template=EVAL_REFINE_TEMPLATE,
    )
    response_obj = refine.get_response(generated_answer, contexts)
    response_txt = str(response_obj)
    if "yes" in response_txt.lower():
        passing = True
    else:
        passing = False
    return {"passing": passing, "reason": str(response_txt)}
Let's try it out on some data.
# use the same query_str, and reference_answer as above
# query_str = "What is the specific name given to the fine-tuned LLMs optimized for dialogue use cases?"
# reference_answer = "The specific name given to the fine-tuned LLMs optimized for dialogue use cases is Llama 2-Chat."
response = query_engine.query(query_str)
generated_answer = str(response)
context_list = [n.get_content() for n in response.source_nodes]
eval_results = run_faithfulness_eval(
    generated_answer,
    contexts=context_list,
    llm=llm,
)
display(eval_results)
{'passing': True, 'reason': 'YES'}
Running Evaluation over our Eval Dataset¶
NOTE: to speed things up and save on cost, we take a very small sample.
import random
sample_size = 5
qa_pairs_sample = random.sample(qa_pairs, sample_size)
import pandas as pd
def run_evals(qa_pairs: List[Tuple[str, str]], llm: OpenAI, query_engine):
    results_list = []
    for question, reference_answer in qa_pairs:
        response = query_engine.query(question)
        generated_answer = str(response)
        # grab the retrieved contexts for the faithfulness eval
        context_list = [n.get_content() for n in response.source_nodes]
        correctness_results = run_correctness_eval(
            question,
            reference_answer,
            generated_answer,
            llm=llm,
            threshold=4.0,
        )
        faithfulness_results = run_faithfulness_eval(
            generated_answer,
            contexts=context_list,
            llm=llm,
        )
        cur_result_dict = {
            "correctness": correctness_results["passing"],
            "faithfulness": faithfulness_results["passing"],
        }
        results_list.append(cur_result_dict)
    return pd.DataFrame(results_list)
evals_df = run_evals(qa_pairs_sample, llm, query_engine)
evals_df["correctness"].mean()
0.4
evals_df["faithfulness"].mean()
0.6
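Both pass rates can also be read off the DataFrame in a single call:

# mean pass rate per metric over the sampled eval set
evals_df.mean()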