
Evaluating AI with Haystack


by Bilge Yucel (X, LinkedIn)

In this cookbook, we walk through Haystack's Evaluators, build an evaluation pipeline, and try out evaluation frameworks like FlowJudge.


!pip install haystack-ai "sentence-transformers>=3.0.0" pypdf "flow-judge[hf]"

1. Building Your Pipeline

ARAGOG

This dataset is based on the paper Advanced Retrieval Augmented Generation Output Grading (ARAGOG). It’s a collection of papers from arXiv covering topics around Transformers and Large Language Models, all in PDF format.

The dataset contains:

  • 13 PDF papers.
  • 107 questions and answers generated with the assistance of GPT-4, and validated/corrected by humans.

For evaluation, we have:

  • ground-truth answers
  • questions
Get the dataset from the deepset-ai/haystack-evaluation repository:

# Run this to download the dataset
!mkdir -p ARAGOG/papers_for_questions

!wget https://raw.githubusercontent.com/deepset-ai/haystack-evaluation/main/datasets/ARAGOG/papers_for_questions/DetectGPT.pdf -P ARAGOG/papers_for_questions
!wget https://raw.githubusercontent.com/deepset-ai/haystack-evaluation/main/datasets/ARAGOG/papers_for_questions/MMLU_measure.pdf -P ARAGOG/papers_for_questions
!wget https://raw.githubusercontent.com/deepset-ai/haystack-evaluation/main/datasets/ARAGOG/papers_for_questions/PAL.pdf -P ARAGOG/papers_for_questions
!wget https://raw.githubusercontent.com/deepset-ai/haystack-evaluation/main/datasets/ARAGOG/papers_for_questions/bert.pdf -P ARAGOG/papers_for_questions
!wget https://raw.githubusercontent.com/deepset-ai/haystack-evaluation/main/datasets/ARAGOG/papers_for_questions/codenet.pdf -P ARAGOG/papers_for_questions
!wget https://raw.githubusercontent.com/deepset-ai/haystack-evaluation/main/datasets/ARAGOG/papers_for_questions/distilbert.pdf -P ARAGOG/papers_for_questions
!wget https://raw.githubusercontent.com/deepset-ai/haystack-evaluation/main/datasets/ARAGOG/papers_for_questions/glm_130b.pdf -P ARAGOG/papers_for_questions
!wget https://raw.githubusercontent.com/deepset-ai/haystack-evaluation/main/datasets/ARAGOG/papers_for_questions/hellaswag.pdf -P ARAGOG/papers_for_questions
!wget https://raw.githubusercontent.com/deepset-ai/haystack-evaluation/main/datasets/ARAGOG/papers_for_questions/llama.pdf -P ARAGOG/papers_for_questions
!wget https://raw.githubusercontent.com/deepset-ai/haystack-evaluation/main/datasets/ARAGOG/papers_for_questions/llm_long_tail.pdf -P ARAGOG/papers_for_questions
!wget https://raw.githubusercontent.com/deepset-ai/haystack-evaluation/main/datasets/ARAGOG/papers_for_questions/meaning_of_prompt.pdf -P ARAGOG/papers_for_questions
!wget https://raw.githubusercontent.com/deepset-ai/haystack-evaluation/main/datasets/ARAGOG/papers_for_questions/megatron.pdf -P ARAGOG/papers_for_questions
!wget https://raw.githubusercontent.com/deepset-ai/haystack-evaluation/main/datasets/ARAGOG/papers_for_questions/red_teaming.pdf -P ARAGOG/papers_for_questions
!wget https://raw.githubusercontent.com/deepset-ai/haystack-evaluation/main/datasets/ARAGOG/papers_for_questions/roberta.pdf -P ARAGOG/papers_for_questions
!wget https://raw.githubusercontent.com/deepset-ai/haystack-evaluation/main/datasets/ARAGOG/papers_for_questions/superglue.pdf -P ARAGOG/papers_for_questions
!wget https://raw.githubusercontent.com/deepset-ai/haystack-evaluation/main/datasets/ARAGOG/papers_for_questions/task2vec.pdf -P ARAGOG/papers_for_questions
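As an optional sanity check (a small sketch, not part of the original notebook), you can confirm that the PDFs landed in the target folder before indexing:

import os

# Count the downloaded PDFs before running the indexing pipeline
pdf_dir = "./ARAGOG/papers_for_questions"
pdf_names = sorted(f for f in os.listdir(pdf_dir) if f.endswith(".pdf"))
print(len(pdf_names), "PDFs downloaded:", pdf_names)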

Indexing Pipeline

import os

from haystack import Pipeline
from haystack.document_stores.in_memory import InMemoryDocumentStore
from haystack.components.converters import PyPDFToDocument
from haystack.components.embedders import SentenceTransformersDocumentEmbedder
from haystack.components.preprocessors import DocumentCleaner, DocumentSplitter
from haystack.components.writers import DocumentWriter
from haystack.document_stores.types import DuplicatePolicy

embedding_model="sentence-transformers/all-MiniLM-L6-v2"
document_store = InMemoryDocumentStore()

files_path = "./ARAGOG/papers_for_questions"
pipeline = Pipeline()
pipeline.add_component("converter", PyPDFToDocument())
pipeline.add_component("cleaner", DocumentCleaner())
pipeline.add_component("splitter", DocumentSplitter(split_length=250, split_by="word"))  # default splitting by word
pipeline.add_component("writer", DocumentWriter(document_store=document_store, policy=DuplicatePolicy.SKIP))
pipeline.add_component("embedder", SentenceTransformersDocumentEmbedder(embedding_model))
pipeline.connect("converter", "cleaner")
pipeline.connect("cleaner", "splitter")
pipeline.connect("splitter", "embedder")
pipeline.connect("embedder", "writer")
pdf_files = [files_path + "/" + f_name for f_name in os.listdir(files_path)]

pipeline.run({"converter": {"sources": pdf_files}})
document_store.count_documents()
691
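If you want to peek at what ended up in the document store, here is a minimal sketch using InMemoryDocumentStore.filter_documents() (called without filters, it returns all indexed chunks):

# Inspect one indexed chunk and its metadata
indexed_docs = document_store.filter_documents()
print(indexed_docs[0].meta)
print(indexed_docs[0].content[:300])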

RAG

import os
from getpass import getpass

if not os.getenv("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass('OPENAI_API_KEY: ')
from haystack import Pipeline
from haystack.components.builders import ChatPromptBuilder
from haystack.components.embedders import SentenceTransformersTextEmbedder
from haystack.components.generators.chat import OpenAIChatGenerator
from haystack.components.retrievers import InMemoryEmbeddingRetriever
from haystack.dataclasses import ChatMessage

chat_message = ChatMessage.from_user(
    text="""You have to answer the following question based on the given context information only.
If the context is empty or just a '\\n' answer with None, example: "None".

Context:
{% for document in documents %}
  {{ document.content }}
{% endfor %}

Question: {{question}}
Answer:
"""
)

basic_rag = Pipeline()
basic_rag.add_component("query_embedder", SentenceTransformersTextEmbedder(
    model=embedding_model, progress_bar=False
))
basic_rag.add_component("retriever", InMemoryEmbeddingRetriever(document_store))
basic_rag.add_component("chat_prompt_builder", ChatPromptBuilder(template=[chat_message], required_variables="*"))
basic_rag.add_component("chat_generator", OpenAIChatGenerator(model="gpt-4o-mini"))

basic_rag.connect("query_embedder", "retriever.query_embedding")
basic_rag.connect("retriever", "chat_prompt_builder.documents")
basic_rag.connect("chat_prompt_builder", "chat_generator")
<haystack.core.pipeline.pipeline.Pipeline object at 0x309fa13d0>
πŸš… Components
  - query_embedder: SentenceTransformersTextEmbedder
  - retriever: InMemoryEmbeddingRetriever
  - chat_prompt_builder: ChatPromptBuilder
  - chat_generator: OpenAIChatGenerator
πŸ›€οΈ Connections
  - query_embedder.embedding -> retriever.query_embedding (List[float])
  - retriever.documents -> chat_prompt_builder.documents (List[Document])
  - chat_prompt_builder.prompt -> chat_generator.messages (List[ChatMessage])
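If you are working in a notebook, you can optionally render the pipeline graph. basic_rag.show() sends the pipeline definition to a Mermaid rendering service and displays the resulting diagram inline:

# Render the RAG pipeline graph inline (requires an internet connection for the Mermaid service)
basic_rag.show()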

2. Human Evaluation

!wget https://raw.githubusercontent.com/deepset-ai/haystack-evaluation/main/datasets/ARAGOG/eval_questions.json -P ARAGOG
from typing import List, Tuple
import json

def read_question_answers() -> Tuple[List[str], List[str]]:
    with open("./ARAGOG/eval_questions.json", "r") as f:
        data = json.load(f)
        questions = data["questions"]
        answers = data["ground_truths"]
    return questions, answers

all_questions, all_answers = read_question_answers()
print(len(all_questions))
print(len(all_answers))
107
107
questions = all_questions[:15]
answers = all_answers[:15]
index = 5
print(questions[index])
print(answers[index])
question = questions[index]
How were the questions for the multitask test sourced, and what was the criteria for their inclusion?
Questions were manually collected by graduate and undergraduate students from freely available online sources, including practice questions for standardized tests and undergraduate courses, ensuring a wide representation of difficulty levels and subjects.
basic_rag.run({"query_embedder":{"text":question}, "chat_prompt_builder":{"question": question}})
{'chat_generator': {'replies': [ChatMessage(_role=<ChatRole.ASSISTANT: 'assistant'>, _content=[TextContent(text='The questions for the multitask test were manually collected by graduate and undergraduate students from freely available sources online. These sources included practice questions for tests such as the Graduate Record Examination and the United States Medical Licensing Examination, as well as questions designed for undergraduate courses and readers of Oxford University Press books. The criteria for inclusion involved ensuring that each subject contained a sufficient number of test examples, with each subject having a minimum of 100 test examples. Tasks that were either too challenging for humans without extensive training or too easy for the machine baselines were filtered out.')], _name=None, _meta={'model': 'gpt-4o-mini-2024-07-18', 'index': 0, 'finish_reason': 'stop', 'usage': {'completion_tokens': 110, 'prompt_tokens': 4550, 'total_tokens': 4660, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}})]}}
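Before moving on to automated metrics, it can help to spot-check a few more answers by hand. The sketch below reuses the same basic_rag.run call to print the question, the ground-truth answer, and the generated answer side by side:

# Manually compare generated answers against the ground truth for a few questions
for q, gt in zip(questions[:3], answers[:3]):
    out = basic_rag.run({"query_embedder": {"text": q}, "chat_prompt_builder": {"question": q}})
    print("Question:    ", q)
    print("Ground truth:", gt)
    print("Generated:   ", out["chat_generator"]["replies"][0].text)
    print("-" * 80)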

3. Deciding on Metrics

  • Semantic Answer Similarity: SASEvaluator compares the embedding of a generated answer against the embedding of the ground-truth answer, using a common embedding model.
  • Context Relevance: ContextRelevanceEvaluator assesses how relevant the retrieved context is for answering the query.
  • Faithfulness: FaithfulnessEvaluator evaluates whether the generated answer can be derived from the retrieved context.

4. Building an Evaluation Pipeline

from haystack import Pipeline
from haystack.components.evaluators import ContextRelevanceEvaluator, FaithfulnessEvaluator, SASEvaluator

eval_pipeline = Pipeline()
eval_pipeline.add_component("context_relevance", ContextRelevanceEvaluator(raise_on_failure=False))
eval_pipeline.add_component("faithfulness", FaithfulnessEvaluator(raise_on_failure=False))
eval_pipeline.add_component("sas", SASEvaluator(model=embedding_model))

5. Running Evaluation

Run the RAG Pipeline

predicted_answers = []
retrieved_context = []

for question in questions: # loops over 15 questions
    result = basic_rag.run(
        {"query_embedder":{"text":question}, "chat_prompt_builder":{"question": question}}, include_outputs_from={"retriever"}
    )
    predicted_answers.append(result["chat_generator"]["replies"][0].text)
    retrieved_context.append(result["retriever"]["documents"])

Run the Evaluation

eval_pipeline_results = eval_pipeline.run(
    {
        "context_relevance": {"questions": questions, "contexts": retrieved_context},
        "faithfulness": {"questions": questions, "contexts": retrieved_context, "predicted_answers": predicted_answers},
        "sas": {"predicted_answers": predicted_answers, "ground_truth_answers": answers},
    }
)

results = {
    "context_relevance": eval_pipeline_results['context_relevance'],
    "faithfulness": eval_pipeline_results['faithfulness'],
    "sas": eval_pipeline_results['sas']
}
100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 15/15 [00:10<00:00,  1.43it/s]
100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 15/15 [00:33<00:00,  2.23s/it]

6. Analyzing Results

EvaluationRunResult

from haystack.evaluation import EvaluationRunResult

inputs = {
    'questions': questions,
    'contexts': retrieved_context,
    'true_answers': answers,
    'predicted_answers': predicted_answers
}
run_name="rag_eval"
eval_results = EvaluationRunResult(run_name=run_name, inputs=inputs, results=results)
eval_results.aggregated_report()
{'metrics': ['context_relevance', 'faithfulness', 'sas'],
 'score': [0.26666666666666666, 0.7, 0.5344941093275944]}
index = 2
print(eval_pipeline_results['context_relevance']["individual_scores"][index], "\nQuestion:", questions[index],"\nTrue Answer:", answers[index], "\nAnswer:", predicted_answers[index])
print("".join([doc.content for doc in retrieved_context[index]]))

7. Evaluation Frameworks

from flow_judge.integrations.haystack import HaystackFlowJudge
from flow_judge.metrics.presets import RESPONSE_FAITHFULNESS_5POINT
from flow_judge import Hf

model = Hf(flash_attn=False)

flow_judge_evaluator = HaystackFlowJudge(
    metric=RESPONSE_FAITHFULNESS_5POINT,
    model=model,
    progress_bar=True,
    raise_on_failure=True,
    save_results=True,
    fail_on_parse_error=False
)
str_fj_retrieved_context = []
for context in retrieved_context:
    str_context = [doc.content for doc in context]
    str_fj_retrieved_context.append(" ".join(str_context))  # one concatenated context string per question
from haystack import Pipeline

integration_eval_pipeline = Pipeline()
integration_eval_pipeline.add_component("flow_judge_evaluator", flow_judge_evaluator)

eval_framework_pipeline_results = integration_eval_pipeline.run(
    {
        "flow_judge_evaluator": {"query": questions, "context": str_fj_retrieved_context, "response": predicted_answers},
    }
)
eval_framework_pipeline_results
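The exact structure of the FlowJudge output depends on the flow-judge integration version, but since Haystack pipeline outputs are keyed by component name, you can always start by inspecting what the evaluator returned:

# List the keys returned by the FlowJudge evaluator (e.g. scores and feedback, depending on the version)
fj_output = eval_framework_pipeline_results["flow_judge_evaluator"]
print(fj_output.keys())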