Evaluating AI with Haystack
Last Updated: July 8, 2025
In this cookbook, we walk through the Evaluators in Haystack, create an evaluation pipeline, and try different evaluation frameworks like FlowJudge.
📚 Useful Resources:
- Article: Benchmarking Haystack Pipelines for Optimal Performance
- Evaluation Walkthrough
- Evaluation tutorial
- Evaluation Docs
- haystack-evaluation repo
📺 Watch Along
!pip install haystack-ai "sentence-transformers>=3.0.0" pypdf "flow-judge[hf]"
1. Building your pipeline
ARAGOG
This dataset is based on the paper Advanced Retrieval Augmented Generation Output Grading (ARAGOG). It’s a collection of papers from ArXiv covering topics around Transformers and Large Language Models, all in PDF format.
The dataset contains:
- 16 PDF papers.
- 107 questions and answers generated with the assistance of GPT-4, and validated/corrected by humans.
We have:
- ground-truth answers
- questions
Get the dataset from the haystack-evaluation repo (datasets/ARAGOG), or run the commands below to download the papers.
# Run this to download the dataset
!mkdir -p ARAGOG/papers_for_questions
!wget https://raw.githubusercontent.com/deepset-ai/haystack-evaluation/main/datasets/ARAGOG/papers_for_questions/DetectGPT.pdf -P ARAGOG/papers_for_questions
!wget https://raw.githubusercontent.com/deepset-ai/haystack-evaluation/main/datasets/ARAGOG/papers_for_questions/MMLU_measure.pdf -P ARAGOG/papers_for_questions
!wget https://raw.githubusercontent.com/deepset-ai/haystack-evaluation/main/datasets/ARAGOG/papers_for_questions/PAL.pdf -P ARAGOG/papers_for_questions
!wget https://raw.githubusercontent.com/deepset-ai/haystack-evaluation/main/datasets/ARAGOG/papers_for_questions/bert.pdf -P ARAGOG/papers_for_questions
!wget https://raw.githubusercontent.com/deepset-ai/haystack-evaluation/main/datasets/ARAGOG/papers_for_questions/codenet.pdf -P ARAGOG/papers_for_questions
!wget https://raw.githubusercontent.com/deepset-ai/haystack-evaluation/main/datasets/ARAGOG/papers_for_questions/distilbert.pdf -P ARAGOG/papers_for_questions
!wget https://raw.githubusercontent.com/deepset-ai/haystack-evaluation/main/datasets/ARAGOG/papers_for_questions/glm_130b.pdf -P ARAGOG/papers_for_questions
!wget https://raw.githubusercontent.com/deepset-ai/haystack-evaluation/main/datasets/ARAGOG/papers_for_questions/hellaswag.pdf -P ARAGOG/papers_for_questions
!wget https://raw.githubusercontent.com/deepset-ai/haystack-evaluation/main/datasets/ARAGOG/papers_for_questions/llama.pdf -P ARAGOG/papers_for_questions
!wget https://raw.githubusercontent.com/deepset-ai/haystack-evaluation/main/datasets/ARAGOG/papers_for_questions/llm_long_tail.pdf -P ARAGOG/papers_for_questions
!wget https://raw.githubusercontent.com/deepset-ai/haystack-evaluation/main/datasets/ARAGOG/papers_for_questions/meaning_of_prompt.pdf -P ARAGOG/papers_for_questions
!wget https://raw.githubusercontent.com/deepset-ai/haystack-evaluation/main/datasets/ARAGOG/papers_for_questions/megatron.pdf -P ARAGOG/papers_for_questions
!wget https://raw.githubusercontent.com/deepset-ai/haystack-evaluation/main/datasets/ARAGOG/papers_for_questions/red_teaming.pdf -P ARAGOG/papers_for_questions
!wget https://raw.githubusercontent.com/deepset-ai/haystack-evaluation/main/datasets/ARAGOG/papers_for_questions/roberta.pdf -P ARAGOG/papers_for_questions
!wget https://raw.githubusercontent.com/deepset-ai/haystack-evaluation/main/datasets/ARAGOG/papers_for_questions/superglue.pdf -P ARAGOG/papers_for_questions
!wget https://raw.githubusercontent.com/deepset-ai/haystack-evaluation/main/datasets/ARAGOG/papers_for_questions/task2vec.pdf -P ARAGOG/papers_for_questions
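Optionally, a quick sanity check (not part of the original notebook) confirms that all PDFs arrived in the expected folder before indexing:
import os

# Optional check: count the PDFs that were just downloaded
pdf_dir = "./ARAGOG/papers_for_questions"
downloaded = sorted(f for f in os.listdir(pdf_dir) if f.endswith(".pdf"))
print(f"{len(downloaded)} PDFs downloaded")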
Indexing Pipeline
import os
from haystack import Pipeline
from haystack.document_stores.in_memory import InMemoryDocumentStore
from haystack.components.converters import PyPDFToDocument
from haystack.components.embedders import SentenceTransformersDocumentEmbedder
from haystack.components.preprocessors import DocumentCleaner, DocumentSplitter
from haystack.components.writers import DocumentWriter
from haystack.document_stores.types import DuplicatePolicy
embedding_model="sentence-transformers/all-MiniLM-L6-v2"
document_store = InMemoryDocumentStore()
files_path = "./ARAGOG/papers_for_questions"
pipeline = Pipeline()
pipeline.add_component("converter", PyPDFToDocument())
pipeline.add_component("cleaner", DocumentCleaner())
pipeline.add_component("splitter", DocumentSplitter(split_length=250, split_by="word")) # default splitting by word
pipeline.add_component("writer", DocumentWriter(document_store=document_store, policy=DuplicatePolicy.SKIP))
pipeline.add_component("embedder", SentenceTransformersDocumentEmbedder(embedding_model))
pipeline.connect("converter", "cleaner")
pipeline.connect("cleaner", "splitter")
pipeline.connect("splitter", "embedder")
pipeline.connect("embedder", "writer")
pdf_files = [files_path + "/" + f_name for f_name in os.listdir(files_path)]
pipeline.run({"converter": {"sources": pdf_files}})
document_store.count_documents()
691
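If you want to verify what the indexing pipeline actually wrote, a minimal inspection sketch (assuming the in-memory store returns all documents when filter_documents() is called without filters) looks like this:
# Peek at one stored chunk to confirm splitting and embedding worked
sample_doc = document_store.filter_documents()[0]
print(sample_doc.content[:300])   # start of the chunk text
print(sample_doc.meta)            # source file and split metadata
print(len(sample_doc.embedding))  # embedding size (384 for all-MiniLM-L6-v2)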
RAG
import os
from getpass import getpass
if not os.getenv("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass('OPENAI_API_KEY: ')
from haystack import Pipeline
from haystack.components.builders import ChatPromptBuilder
from haystack.components.embedders import SentenceTransformersTextEmbedder
from haystack.components.generators.chat import OpenAIChatGenerator
from haystack.components.retrievers import InMemoryEmbeddingRetriever
from haystack.dataclasses import ChatMessage
chat_message = ChatMessage.from_user(
text="""You have to answer the following question based on the given context information only.
If the context is empty or just a '\\n' answer with None, example: "None".
Context:
{% for document in documents %}
{{ document.content }}
{% endfor %}
Question: {{question}}
Answer:
"""
)
basic_rag = Pipeline()
basic_rag.add_component("query_embedder", SentenceTransformersTextEmbedder(
model=embedding_model, progress_bar=False
))
basic_rag.add_component("retriever", InMemoryEmbeddingRetriever(document_store))
basic_rag.add_component("chat_prompt_builder", ChatPromptBuilder(template=[chat_message], required_variables="*"))
basic_rag.add_component("chat_generator", OpenAIChatGenerator(model="gpt-4o-mini"))
basic_rag.connect("query_embedder", "retriever.query_embedding")
basic_rag.connect("retriever", "chat_prompt_builder.documents")
basic_rag.connect("chat_prompt_builder", "chat_generator")
<haystack.core.pipeline.pipeline.Pipeline object at 0x309fa13d0>
🚅 Components
- query_embedder: SentenceTransformersTextEmbedder
- retriever: InMemoryEmbeddingRetriever
- chat_prompt_builder: ChatPromptBuilder
- chat_generator: OpenAIChatGenerator
🛤️ Connections
- query_embedder.embedding -> retriever.query_embedding (List[float])
- retriever.documents -> chat_prompt_builder.documents (List[Document])
- chat_prompt_builder.prompt -> chat_generator.messages (List[ChatMessage])
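Optionally, you can render the pipeline graph to double-check these connections. The sketch below assumes you are in a notebook and uses Haystack's Mermaid-based drawing helpers, which may need an internet connection:
from pathlib import Path

basic_rag.show()                              # draw the graph inline in a notebook
# basic_rag.draw(path=Path("basic_rag.png"))  # or write the diagram to a file instead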
2. Human Evaluation
!wget https://raw.githubusercontent.com/deepset-ai/haystack-evaluation/main/datasets/ARAGOG/eval_questions.json -P ARAGOG
from typing import List, Tuple
import json
def read_question_answers() -> Tuple[List[str], List[str]]:
    with open("./ARAGOG/eval_questions.json", "r") as f:
        data = json.load(f)
    questions = data["questions"]
    answers = data["ground_truths"]
    return questions, answers
all_questions, all_answers = read_question_answers()
print(len(all_questions))
print(len(all_answers))
107
107
questions = all_questions[:15]
answers = all_answers[:15]
index = 5
print(questions[index])
print(answers[index])
question = questions[index]
How were the questions for the multitask test sourced, and what was the criteria for their inclusion?
Questions were manually collected by graduate and undergraduate students from freely available online sources, including practice questions for standardized tests and undergraduate courses, ensuring a wide representation of difficulty levels and subjects.
basic_rag.run({"query_embedder":{"text":question}, "chat_prompt_builder":{"question": question}})
{'chat_generator': {'replies': [ChatMessage(_role=<ChatRole.ASSISTANT: 'assistant'>, _content=[TextContent(text='The questions for the multitask test were manually collected by graduate and undergraduate students from freely available sources online. These sources included practice questions for tests such as the Graduate Record Examination and the United States Medical Licensing Examination, as well as questions designed for undergraduate courses and readers of Oxford University Press books. The criteria for inclusion involved ensuring that each subject contained a sufficient number of test examples, with each subject having a minimum of 100 test examples. Tasks that were either too challenging for humans without extensive training or too easy for the machine baselines were filtered out.')], _name=None, _meta={'model': 'gpt-4o-mini-2024-07-18', 'index': 0, 'finish_reason': 'stop', 'usage': {'completion_tokens': 110, 'prompt_tokens': 4550, 'total_tokens': 4660, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}})]}}
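To make this spot check a bit more systematic, a small helper sketch (not part of the original pipeline) runs a few questions and prints the question, the ground-truth answer, and the generated answer side by side for manual review:
# Manual review: compare ground truth and generated answers for a few questions
for q, gt in list(zip(questions, answers))[:3]:
    result = basic_rag.run({"query_embedder": {"text": q}, "chat_prompt_builder": {"question": q}})
    print("Question:    ", q)
    print("Ground truth:", gt)
    print("Generated:   ", result["chat_generator"]["replies"][0].text)
    print("-" * 80)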
3. Deciding on Metrics
- Semantic Answer Similarity: SASEvaluator compares the embedding of a generated answer against that of a ground-truth answer, using a common embedding model (a standalone example follows this list).
- Context Relevance: ContextRelevanceEvaluator assesses how relevant the retrieved context is to the query.
- Faithfulness: FaithfulnessEvaluator evaluates whether the generated answer can be derived from the retrieved context.
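To get a feel for what a single evaluator expects and returns, here is a minimal standalone sketch with SASEvaluator on toy strings (not the ARAGOG data); when used outside a pipeline, warm_up() loads the embedding model before run():
from haystack.components.evaluators import SASEvaluator

sas = SASEvaluator(model=embedding_model)
sas.warm_up()  # load the embedding model (pipelines do this automatically)

result = sas.run(
    ground_truth_answers=["Berlin is the capital of Germany."],
    predicted_answers=["The capital of Germany is Berlin."],
)
print(result["score"])              # aggregate score over all pairs
print(result["individual_scores"])  # one similarity score per answer pair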
4. Building an Evaluation Pipeline
from haystack import Pipeline
from haystack.components.evaluators import ContextRelevanceEvaluator, FaithfulnessEvaluator, SASEvaluator
eval_pipeline = Pipeline()
eval_pipeline.add_component("context_relevance", ContextRelevanceEvaluator(raise_on_failure=False))
eval_pipeline.add_component("faithfulness", FaithfulnessEvaluator(raise_on_failure=False))
eval_pipeline.add_component("sas", SASEvaluator(model=embedding_model))
5. Running Evaluation
Run the RAG Pipeline
predicted_answers = []
retrieved_context = []
for question in questions:  # loop over the 15 sampled questions
    result = basic_rag.run(
        {"query_embedder": {"text": question}, "chat_prompt_builder": {"question": question}},
        include_outputs_from={"retriever"},
    )
    predicted_answers.append(result["chat_generator"]["replies"][0].text)
    retrieved_context.append(result["retriever"]["documents"])
Run the Evaluation
eval_pipeline_results = eval_pipeline.run(
{
"context_relevance": {"questions": questions, "contexts": retrieved_context},
"faithfulness": {"questions": questions, "contexts": retrieved_context, "predicted_answers": predicted_answers},
"sas": {"predicted_answers": predicted_answers, "ground_truth_answers": answers},
}
)
results = {
"context_relevance": eval_pipeline_results['context_relevance'],
"faithfulness": eval_pipeline_results['faithfulness'],
"sas": eval_pipeline_results['sas']
}
100%|██████████| 15/15 [00:10<00:00,  1.43it/s]
100%|██████████| 15/15 [00:33<00:00,  2.23s/it]
6. Analyzing Results
from haystack.evaluation import EvaluationRunResult
inputs = {
'questions': questions,
'contexts': retrieved_context,
'true_answers': answers,
'predicted_answers': predicted_answers
}
run_name="rag_eval"
eval_results = EvaluationRunResult(run_name=run_name, inputs=inputs, results=results)
eval_results.aggregated_report()
{'metrics': ['context_relevance', 'faithfulness', 'sas'],
'score': [0.26666666666666666, 0.7, 0.5344941093275944]}
index = 2
print(eval_pipeline_results['context_relevance']["individual_scores"][index], "\nQuestion:", questions[index],"\nTrue Answer:", answers[index], "\nAnswer:", predicted_answers[index])
print("".join([doc.content for doc in retrieved_context[index]]))
7. Evaluation Frameworks
- For the RagasEvaluator, check out our cookbook RAG Pipeline Evaluation Using RAGAS
- Here, we show how to use FlowJudge
from flow_judge.integrations.haystack import HaystackFlowJudge
from flow_judge.metrics.presets import RESPONSE_FAITHFULNESS_5POINT
from flow_judge import Hf
model = Hf(flash_attn=False)
flow_judge_evaluator = HaystackFlowJudge(
metric=RESPONSE_FAITHFULNESS_5POINT,
model=model,
progress_bar=True,
raise_on_failure=True,
save_results=True,
fail_on_parse_error=False
)
import os
from getpass import getpass
if not os.getenv("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass('OPENAI_API_KEY: ')
str_fj_retrieved_context = []
for context in retrieved_context:
    str_context = [doc.content for doc in context]
    str_fj_retrieved_context.append(" ".join(str_context))  # one concatenated context string per question
from haystack import Pipeline
integration_eval_pipeline = Pipeline()
integration_eval_pipeline.add_component("flow_judge_evaluator", flow_judge_evaluator)
eval_framework_pipeline_results = integration_eval_pipeline.run(
{
"flow_judge_evaluator": {"query": questions, "context": str_fj_retrieved_context, "response": predicted_answers},
}
)
eval_framework_pipeline_results
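The exact structure of the FlowJudge output depends on the flow-judge version, so it is worth inspecting the returned keys before parsing individual scores:
# Pipeline outputs are keyed by component name; check the structure before drilling in
fj_output = eval_framework_pipeline_results["flow_judge_evaluator"]
print(fj_output.keys())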