Setup¶
We load in some data and define a very simple RAG query engine that we will evaluate (it uses top-k retrieval).
%pip install llama-index-readers-file pymupdf
%pip install llama-index-llms-openai
!mkdir data
!wget --user-agent "Mozilla" "https://arxiv.org/pdf/2307.09288.pdf" -O "data/llama2.pdf"
mkdir: data: File exists
--2023-09-19 00:05:14--  https://arxiv.org/pdf/2307.09288.pdf
Resolving arxiv.org (arxiv.org)... 128.84.21.199
Connecting to arxiv.org (arxiv.org)|128.84.21.199|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 13661300 (13M) [application/pdf]
Saving to: ‘data/llama2.pdf’

data/llama2.pdf     100%[===================>]  13.03M  1.56MB/s    in 9.3s

2023-09-19 00:05:25 (1.40 MB/s) - ‘data/llama2.pdf’ saved [13661300/13661300]
from pathlib import Path
from llama_index.readers.file import PyMuPDFReader
loader = PyMuPDFReader()
documents = loader.load(file_path="./data/llama2.pdf")
from llama_index.core import VectorStoreIndex
from llama_index.core.node_parser import SentenceSplitter
from llama_index.llms.openai import OpenAI
llm = OpenAI(model="gpt-4")
node_parser = SentenceSplitter(chunk_size=1024)
nodes = node_parser.get_nodes_from_documents(documents)
index = VectorStoreIndex(nodes)
query_engine = index.as_query_engine(llm=llm)
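Before generating the dataset, it can help to sanity-check the engine with a single query (the question below is just an illustrative example, not part of the dataset we build later):

# quick sanity check that retrieval + synthesis work end to end
test_response = query_engine.query("What is Llama 2-Chat optimized for?")
print(str(test_response))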
Dataset Generation¶
We first go through an exercise of generating a synthetic evaluation dataset. We do this by synthetically generating a set of questions from existing context, and then running each question (with the existing context) through a powerful LLM (e.g. GPT-4) to generate a "ground truth" response.
Define Functions¶
We define the functions that we will use for dataset generation:
from llama_index.core.schema import BaseNode
from llama_index.llms.openai import OpenAI
from llama_index.core.llms import ChatMessage, MessageRole
from llama_index.core import ChatPromptTemplate, PromptTemplate
from typing import Tuple, List
import re
llm = OpenAI(model="gpt-4")
We define generate_answers_for_questions to generate answers for a set of questions given a context.
QA_PROMPT = PromptTemplate(
    "Context information is below.\n"
    "---------------------\n"
    "{context_str}\n"
    "---------------------\n"
    "Given the context information and not prior knowledge, "
    "answer the query.\n"
    "Query: {query_str}\n"
    "Answer: "
)


def generate_answers_for_questions(
    questions: List[str], context: str, llm: OpenAI
) -> List[str]:
    """Generate answers for questions given context."""
    answers = []
    for question in questions:
        fmt_qa_prompt = QA_PROMPT.format(
            context_str=context, query_str=question
        )
        response_obj = llm.complete(fmt_qa_prompt)
        answers.append(str(response_obj))
    return answers
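As a quick illustration, the helper can be exercised on a single chunk with a couple of hand-written questions before wiring it into the full pipeline (the questions here are just examples):

# try the helper on the first node with two example questions
sample_context = nodes[0].get_content(metadata_mode="all")
sample_questions = [
    "What models does the document introduce?",
    "What use case are the fine-tuned models optimized for?",
]
print(generate_answers_for_questions(sample_questions, sample_context, llm))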
We define generate_qa_pairs to generate question/answer pairs over an entire list of nodes.
QUESTION_GEN_USER_TMPL = (
    "Context information is below.\n"
    "---------------------\n"
    "{context_str}\n"
    "---------------------\n"
    "Given the context information and not prior knowledge, "
    "generate the relevant questions. "
)

QUESTION_GEN_SYS_TMPL = """\
You are a Teacher/ Professor. Your task is to setup \
{num_questions_per_chunk} questions for an upcoming \
quiz/examination. The questions should be diverse in nature \
across the document. Restrict the questions to the \
context information provided.\
"""

question_gen_template = ChatPromptTemplate(
    message_templates=[
        ChatMessage(role=MessageRole.SYSTEM, content=QUESTION_GEN_SYS_TMPL),
        ChatMessage(role=MessageRole.USER, content=QUESTION_GEN_USER_TMPL),
    ]
)


def generate_qa_pairs(
    nodes: List[BaseNode], llm: OpenAI, num_questions_per_chunk: int = 10
) -> List[Tuple[str, str]]:
    """Generate question/answer pairs for each node."""
    qa_pairs = []
    for idx, node in enumerate(nodes):
        print(f"Node {idx}/{len(nodes)}")
        context_str = node.get_content(metadata_mode="all")
        fmt_messages = question_gen_template.format_messages(
            num_questions_per_chunk=num_questions_per_chunk,
            context_str=context_str,
        )
        chat_response = llm.chat(fmt_messages)
        raw_output = chat_response.message.content
        result_list = str(raw_output).strip().split("\n")
        cleaned_questions = [
            re.sub(r"^\d+[\).\s]", "", question).strip()
            for question in result_list
        ]
        answers = generate_answers_for_questions(
            cleaned_questions, context_str, llm
        )
        cur_qa_pairs = list(zip(cleaned_questions, answers))
        qa_pairs.extend(cur_qa_pairs)
    return qa_pairs
qa_pairs
[('What is the main focus of the work described in the document?', 'The main focus of the work described in the document is the development and release of Llama 2, a collection of pretrained and fine-tuned large language models (LLMs) ranging in scale from 7 billion to 70 billion parameters. The fine-tuned LLMs, called Llama 2-Chat, are optimized for dialogue use cases. The document also provides a detailed description of the approach to fine-tuning and safety improvements of Llama 2-Chat.'), ('What is the range of parameters for the large language models (LLMs) developed in this work?', 'The range of parameters for the large language models (LLMs) developed in this work is from 7 billion to 70 billion.'), ('What is the specific name given to the fine-tuned LLMs optimized for dialogue use cases?', 'The specific name given to the fine-tuned LLMs optimized for dialogue use cases is Llama 2-Chat.'), ('How do the models developed in this work compare to open-source chat models based on the benchmarks tested?', 'The models developed in this work, specifically the fine-tuned LLMs called Llama 2-Chat, outperform open-source chat models on most benchmarks tested.'), ('What are the two key areas of human evaluation mentioned in the document for the developed models?', 'The two key areas of human evaluation mentioned in the document for the developed models are helpfulness and safety.'), ('What is the purpose of providing a detailed description of the approach to fine-tuning and safety improvements of Llama 2-Chat?', 'The purpose of providing a detailed description of the approach to fine-tuning and safety improvements of Llama 2-Chat is to enable the community to build on their work and contribute to the responsible development of Large Language Models (LLMs).'), ('What is the intended benefit for the community from this work?', 'The intended benefit for the community from this work is to enable them to build on the work and contribute to the responsible development of large language models (LLMs). The team provides a detailed description of their approach to fine-tuning and safety improvements of Llama 2-Chat for this purpose.'), ('Who are the corresponding authors of this work and how can they be contacted?', 'The corresponding authors of this work are Thomas Scialom and Hugo Touvron. They can be contacted via email at [email protected] and [email protected] respectively.'), ('What is the source of the document and how many pages does it contain?', 'The source of the document is "1" and it contains 77 pages.'), ('Where can the contributions of all the authors be found in the document?', 'The contributions of all the authors can be found in Section A.1 of the document.')]
Getting Pairs over the Dataset¶
NOTE: this can take a long time. To speed it up, try feeding in a subset of the nodes.
qa_pairs = generate_qa_pairs(
    # nodes[:1],
    nodes,
    llm,
    num_questions_per_chunk=10,
)
[Optional] Define save/load¶
# save
import pickle
pickle.dump(qa_pairs, open("eval_dataset.pkl", "wb"))
# load
import pickle
qa_pairs = pickle.load(open("eval_dataset.pkl", "rb"))
Evaluating Generation¶
In this section we walk through a few methods for evaluating the generated results. At a high level, we use an "evaluation LLM" to measure the quality of the generated results. We do this in both the with-labels and label-free settings.
We go through the following evaluation algorithms:
- Correctness: compares the generated answer against the ground-truth answer.
- Faithfulness: evaluates whether a response is faithful to the contexts (label-free).
from llama_index.core.llms import ChatMessage, MessageRole
from llama_index.core import ChatPromptTemplate, PromptTemplate
from typing import Dict
CORRECTNESS_SYS_TMPL = """
You are an expert evaluation system for a question answering chatbot.
You are given the following information:
- a user query,
- a reference answer, and
- a generated answer.
Your job is to judge the relevance and correctness of the generated answer.
Output a single score that represents a holistic evaluation.
You must return your response in a line with only the score.
Do not return answers in any other format.
On a separate line provide your reasoning for the score as well.
Follow these guidelines for scoring:
- Your score has to be between 1 and 5, where 1 is the worst and 5 is the best.
- If the generated answer is not relevant to the user query, \
you should give a score of 1.
- If the generated answer is relevant but contains mistakes, \
you should give a score between 2 and 3.
- If the generated answer is relevant and fully correct, \
you should give a score between 4 and 5.
"""
CORRECTNESS_USER_TMPL = """
## User Query
{query}
## Reference Answer
{reference_answer}
## Generated Answer
{generated_answer}
"""
eval_chat_template = ChatPromptTemplate(
    message_templates=[
        ChatMessage(role=MessageRole.SYSTEM, content=CORRECTNESS_SYS_TMPL),
        ChatMessage(role=MessageRole.USER, content=CORRECTNESS_USER_TMPL),
    ]
)
Now that we've defined the prompt templates, let's define an evaluation function that feeds the prompt to the LLM and parses the output into a dict of results.
from llama_index.llms.openai import OpenAI
def run_correctness_eval(
    query_str: str,
    reference_answer: str,
    generated_answer: str,
    llm: OpenAI,
    threshold: float = 4.0,
) -> Dict:
    """Run correctness eval."""
    fmt_messages = eval_chat_template.format_messages(
        query=query_str,
        reference_answer=reference_answer,
        generated_answer=generated_answer,
    )
    chat_response = llm.chat(fmt_messages)
    raw_output = chat_response.message.content
    # extract the score (first line) and the reasoning (remaining lines)
    score_str, reasoning_str = raw_output.split("\n", 1)
    score = float(score_str)
    reasoning = reasoning_str.lstrip("\n")
    return {"passing": score >= threshold, "score": score, "reason": reasoning}
Now let's try running this on some sample inputs with a chat model (GPT-4).
llm = OpenAI(model="gpt-4")
# query_str = "What is the range of parameters for the large language models (LLMs) developed in this work?"
# reference_answer = "The range of parameters for the large language models (LLMs) developed in this work is from 7 billion to 70 billion."
query_str = (
    "What is the specific name given to the fine-tuned LLMs optimized for"
    " dialogue use cases?"
)
reference_answer = (
    "The specific name given to the fine-tuned LLMs optimized for dialogue use"
    " cases is Llama 2-Chat."
)
generated_answer = str(query_engine.query(query_str))
print(str(generated_answer))
The fine-tuned Large Language Models (LLMs) optimized for dialogue use cases are specifically called Llama 2-Chat.
eval_results = run_correctness_eval(
    query_str, reference_answer, generated_answer, llm=llm, threshold=4.0
)
display(eval_results)
display(eval_results)
{'passing': True, 'score': 5.0, 'reason': 'The generated answer is completely relevant to the user query and matches the reference answer in terms of information. It correctly identifies "Llama 2-Chat" as the specific name given to the fine-tuned LLMs optimized for dialogue use cases.'}
Building a Faithfulness Evaluator¶
The faithfulness evaluator evaluates whether the response is faithful to any of the retrieved contexts.
This is a step up in complexity from the correctness evaluator. Since the set of contexts can be quite long, they might overflow the context window, so we need a response synthesis strategy that iterates over the contexts in sequence.
We have a corresponding tutorial showing how to build response synthesis from scratch, as well as out-of-the-box response synthesis modules. In this guide we use the out-of-the-box modules.
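Conceptually, "create and refine" answers against the first context chunk and then revisits that answer for each subsequent chunk. A minimal from-scratch sketch of that loop (independent of the out-of-the-box Refine module used below; the refine prompt here is a simplified stand-in, not the library's template) might look like this:

def create_and_refine(query_str: str, contexts: List[str], llm: OpenAI) -> str:
    """Sketch of a sequential create-and-refine loop over contexts."""
    answer = None
    for context in contexts:
        if answer is None:
            # first chunk: answer from scratch using the QA prompt defined earlier
            prompt = QA_PROMPT.format(context_str=context, query_str=query_str)
        else:
            # later chunks: ask the LLM to refine the existing answer (simplified prompt)
            prompt = (
                f"Query: {query_str}\n"
                f"Existing answer: {answer}\n"
                f"New context: {context}\n"
                "Refine the existing answer if the new context is relevant; "
                "otherwise repeat the existing answer.\n"
            )
        answer = str(llm.complete(prompt))
    return answer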
EVAL_TEMPLATE = PromptTemplate(
    "Please tell if a given piece of information "
    "is supported by the context.\n"
    "You need to answer with either YES or NO.\n"
    "Answer YES if any of the context supports the information, even "
    "if most of the context is unrelated. "
    "Some examples are provided below. \n\n"
    "Information: Apple pie is generally double-crusted.\n"
    "Context: An apple pie is a fruit pie in which the principal filling "
    "ingredient is apples. \n"
    "Apple pie is often served with whipped cream, ice cream "
    "('apple pie à la mode'), custard or cheddar cheese.\n"
    "It is generally double-crusted, with pastry both above "
    "and below the filling; the upper crust may be solid or "
    "latticed (woven of crosswise strips).\n"
    "Answer: YES\n"
    "Information: Apple pies tastes bad.\n"
    "Context: An apple pie is a fruit pie in which the principal filling "
    "ingredient is apples. \n"
    "Apple pie is often served with whipped cream, ice cream "
    "('apple pie à la mode'), custard or cheddar cheese.\n"
    "It is generally double-crusted, with pastry both above "
    "and below the filling; the upper crust may be solid or "
    "latticed (woven of crosswise strips).\n"
    "Answer: NO\n"
    "Information: {query_str}\n"
    "Context: {context_str}\n"
    "Answer: "
)

EVAL_REFINE_TEMPLATE = PromptTemplate(
    "We want to understand if the following information is present "
    "in the context information: {query_str}\n"
    "We have provided an existing YES/NO answer: {existing_answer}\n"
    "We have the opportunity to refine the existing answer "
    "(only if needed) with some more context below.\n"
    "------------\n"
    "{context_msg}\n"
    "------------\n"
    "If the existing answer was already YES, still answer YES. "
    "If the information is present in the new context, answer YES. "
    "Otherwise answer NO.\n"
)
NOTE: in the current response synthesizer setup we don't separate out a system message and user message for the chat endpoints, so we just use our standard llm.complete for text completion.
We now define our function below. Since we defined both a standard eval template for a given piece of context and a refine template for subsequent contexts, we implement our "create and refine" response synthesis strategy to obtain the answer.
from llama_index.core.response_synthesizers import Refine
from typing import List, Dict
def run_faithfulness_eval(
    generated_answer: str,
    contexts: List[str],
    llm: OpenAI,
) -> Dict:
    """Run faithfulness eval."""
    refine = Refine(
        llm=llm,
        text_qa_template=EVAL_TEMPLATE,
        refine_template=EVAL_REFINE_TEMPLATE,
    )
    response_obj = refine.get_response(generated_answer, contexts)
    response_txt = str(response_obj)
    if "yes" in response_txt.lower():
        passing = True
    else:
        passing = False
    return {"passing": passing, "reason": str(response_txt)}
Let's try it out on some data.
# use the same query_str, and reference_answer as above
# query_str = "What is the specific name given to the fine-tuned LLMs optimized for dialogue use cases?"
# reference_answer = "The specific name given to the fine-tuned LLMs optimized for dialogue use cases is Llama 2-Chat."
response = query_engine.query(query_str)
generated_answer = str(response)
context_list = [n.get_content() for n in response.source_nodes]
eval_results = run_faithfulness_eval(
    generated_answer,
    contexts=context_list,
    llm=llm,
)
display(eval_results)
{'passing': True, 'reason': 'YES'}
Running Evaluation over our Eval Dataset¶
NOTE: to speed things up and save on cost, we take a very small sample.
import random
sample_size = 5
qa_pairs_sample = random.sample(qa_pairs, sample_size)
import pandas as pd
def run_evals(qa_pairs: List[Tuple[str, str]], llm: OpenAI, query_engine):
    results_list = []
    for question, reference_answer in qa_pairs:
        response = query_engine.query(question)
        generated_answer = str(response)
        # grab the retrieved contexts for the faithfulness eval
        context_list = [n.get_content() for n in response.source_nodes]
        correctness_results = run_correctness_eval(
            question,
            reference_answer,
            generated_answer,
            llm=llm,
            threshold=4.0,
        )
        faithfulness_results = run_faithfulness_eval(
            generated_answer,
            contexts=context_list,
            llm=llm,
        )
        cur_result_dict = {
            "correctness": correctness_results["passing"],
            "faithfulness": faithfulness_results["passing"],
        }
        results_list.append(cur_result_dict)
    return pd.DataFrame(results_list)
evals_df = run_evals(qa_pairs_sample, llm, query_engine)
evals_df["correctness"].mean()
0.4
evals_df["faithfulness"].mean()
0.6
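Both pass rates can also be read off the DataFrame in a single call:

# mean pass rate per metric over the sampled eval set
evals_df.mean()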