Evaluation of a QA System

Executable version: Colab

To judge the performance of a question-answering system, it is important to evaluate it. Furthermore, evaluation allows you to determine which parts of the system can be improved.

Start an Elasticsearch server

You can start Elasticsearch on your local machine using Docker. If Docker is not readily available in your environment (e.g., in Colab notebooks), you can manually download and run Elasticsearch from source.
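If Docker is available, a single container is usually the quickest option. The command below is a typical single-node setup; the image tag 7.6.2 is chosen here simply to match the from-source download used later in this tutorial.

# With Docker: run a single-node Elasticsearch container (version chosen to match the steps below)
! docker run -d -p 9200:9200 -e "discovery.type=single-node" elasticsearch:7.6.2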

# Install the latest release of Haystack in your own environment 
#! pip install farm-haystack

# Install the latest master of Haystack and install the version of torch that works with the colab GPUs
!pip install git+https://github.com/deepset-ai/haystack.git
!pip install torch==1.6.0+cu101 -f https://download.pytorch.org/whl/torch_stable.html
# In Colab / No Docker environments: Start Elasticsearch from source
! wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.6.2-linux-x86_64.tar.gz -q
! tar -xzf elasticsearch-7.6.2-linux-x86_64.tar.gz
! chown -R daemon:daemon elasticsearch-7.6.2

import os
from subprocess import Popen, PIPE, STDOUT
es_server = Popen(['elasticsearch-7.6.2/bin/elasticsearch'],
                   stdout=PIPE, stderr=STDOUT,
                   preexec_fn=lambda: os.setuid(1)  # as daemon
                  )
# wait until ES has started
! sleep 30
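# Optional: confirm Elasticsearch is reachable before continuing (assumes the default port 9200)
! curl -s http://localhost:9200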
from farm.utils import initialize_device_settings

device, n_gpu = initialize_device_settings(use_cuda=True)
from haystack.preprocessor.utils import fetch_archive_from_http

# Download evaluation data, which is a subset of Natural Questions development set containing 50 documents
doc_dir = "../data/nq"
s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/nq_dev_subset_v2.json.zip"
fetch_archive_from_http(url=s3_url, output_dir=doc_dir)
# make sure these indices do not collide with existing ones, the indices will be wiped clean before data is inserted
doc_index = "tutorial5_docs"
label_index = "tutorial5_labels"
# Connect to Elasticsearch
from haystack.document_store.elasticsearch import ElasticsearchDocumentStore

document_store = ElasticsearchDocumentStore(host="localhost", username="", password="", index="document",
                                            create_index=False, embedding_field="emb",
                                            embedding_dim=768, excluded_meta_data=["emb"])
# Add evaluation data to Elasticsearch Document Store
# We first delete the custom tutorial indices to not have duplicate elements
document_store.delete_all_documents(index=doc_index)
document_store.delete_all_documents(index=label_index)
document_store.add_eval_data(filename="../data/nq/nq_dev_subset_v2.json", doc_index=doc_index, label_index=label_index)
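As a quick sanity check that the evaluation data landed in the right indices, you can count the inserted documents and labels. The count methods below are assumed to be available on the document store in this Haystack version.

# Sanity check: count the documents and labels that were just inserted
# (get_document_count / get_label_count are assumed to exist in this Haystack version)
print("Documents:", document_store.get_document_count(index=doc_index))
print("Labels:", document_store.get_label_count(index=label_index))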

Initialize components of the QA system

# Initialize Retriever
from haystack.retriever.sparse import ElasticsearchRetriever
retriever = ElasticsearchRetriever(document_store=document_store)
# Alternative: Evaluate DensePassageRetriever
# Note that DPR works best when you index short passages < 512 tokens, as only those tokens will be used for the embedding.
# Here, the documents in nq_dev_subset_v2.json have an average of 5220(!) tokens.
# DPR still outperforms Elastic's BM25 by a small margin here.
# from haystack.retriever.dense import DensePassageRetriever
# retriever = DensePassageRetriever(document_store=document_store,
#                                  query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
#                                  passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
#                                  use_gpu=True,
#                                  embed_title=True,
#                                  max_seq_len=256,
#                                  batch_size=16,
#                                  remove_sep_tok_from_untitled_passages=True)
#document_store.update_embeddings(retriever, index=doc_index)
# Initialize Reader
from haystack.reader.farm import FARMReader

reader = FARMReader("deepset/roberta-base-squad2", top_k_per_candidate=4)
# Initialize Finder which sticks together Reader and Retriever
from haystack.finder import Finder

finder = Finder(reader, retriever)
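Before running the full evaluation, it can help to push a single query through the pipeline to confirm that Retriever, Reader and Finder are wired up correctly. The question below is only illustrative and may not be answerable from this small subset; finder.get_answers() and print_answers() are assumed to be available in this Haystack version.

# Optional smoke test with an illustrative question (may not be answerable from this 50-document subset)
from haystack.utils import print_answers

prediction = finder.get_answers(question="Who is the author of Pride and Prejudice?",
                                top_k_retriever=10, top_k_reader=3)
print_answers(prediction, details="minimal")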

Evaluation of Retriever

## Evaluate Retriever on its own
retriever_eval_results = retriever.eval(top_k=20, label_index=label_index, doc_index=doc_index)
## Retriever Recall is the proportion of questions for which the correct document containing the answer is
## among the retrieved documents
print("Retriever Recall:", retriever_eval_results["recall"])
## Retriever Mean Avg Precision rewards retrievers that give relevant documents a higher rank
print("Retriever Mean Avg Precision:", retriever_eval_results["map"])

Evaluation of Reader

# Evaluate Reader on its own
reader_eval_results = reader.eval(document_store=document_store, device=device, label_index=label_index, doc_index=doc_index)
# Evaluation of Reader can also be done directly on a SQuAD-formatted file without passing the data to Elasticsearch
#reader_eval_results = reader.eval_on_file("../data/nq", "nq_dev_subset_v2.json", device=device)

## Reader Top-N-Accuracy is the proportion of questions for which the correct answer is among the top N predicted answers
print("Reader Top-N-Accuracy:", reader_eval_results["top_n_accuracy"])
## Reader Exact Match is the proportion of questions where the predicted answer is exactly the same as the correct answer
print("Reader Exact Match:", reader_eval_results["EM"])
## Reader F1-Score is the average overlap between the predicted answers and the correct answers
print("Reader F1-Score:", reader_eval_results["f1"])

Evaluation of Finder

# Evaluate combination of Reader and Retriever through Finder
finder_eval_results = finder.eval(top_k_retriever=1, top_k_reader=10, label_index=label_index, doc_index=doc_index)
finder.print_eval_results(finder_eval_results)