RAG with Llama 3.1

Simple RAG example on the Oscars using Llama 3.1 open models and the Haystack LLM framework.

Installation

! pip install haystack-ai "transformers>=4.43.1" sentence-transformers accelerate bitsandbytes

Authorization

import getpass, os

os.environ["HF_API_TOKEN"] = getpass.getpass("Your Hugging Face token")

RAG with Llama-3.1-8B-Instruct (about the Oscars) 🏆🎬

! pip install wikipedia

Load data from Wikipedia

from IPython.display import Image
from pprint import pprint
import rich
import random
import wikipedia
from haystack.dataclasses import Document

title = "96th_Academy_Awards"
page = wikipedia.page(title=title, auto_suggest=False)
raw_docs = [Document(content=page.content, meta={"title": page.title, "url":page.url})]
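
If you want a slightly richer corpus, the same pattern extends to several pages; the extra title below is only an example and not part of the original notebook.

# optional extension: load more than one related page into raw_docs
titles = ["96th_Academy_Awards", "95th_Academy_Awards"]  # example titles
raw_docs = []
for t in titles:
    p = wikipedia.page(title=t, auto_suggest=False)
    raw_docs.append(Document(content=p.content, meta={"title": p.title, "url": p.url}))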

Indexing Pipeline

from haystack import Pipeline
from haystack.document_stores.in_memory import InMemoryDocumentStore
from haystack.components.embedders import SentenceTransformersTextEmbedder, SentenceTransformersDocumentEmbedder
from haystack.components.writers import DocumentWriter
from haystack.components.preprocessors import DocumentSplitter
from haystack.utils import ComponentDevice

document_store = InMemoryDocumentStore()
indexing_pipeline = Pipeline()
indexing_pipeline.add_component("splitter", DocumentSplitter(split_by="word", split_length=200))

indexing_pipeline.add_component(
    "embedder",
    SentenceTransformersDocumentEmbedder(
        model="Snowflake/snowflake-arctic-embed-l",  # good embedding model: https://huggingface.co/Snowflake/snowflake-arctic-embed-l
        device=ComponentDevice.from_str("cuda:0"),    # load the model on GPU
    ))
indexing_pipeline.add_component("writer", DocumentWriter(document_store=document_store))

# connect the components
indexing_pipeline.connect("splitter", "embedder")
indexing_pipeline.connect("embedder", "writer")
indexing_pipeline.run({"splitter":{"documents":raw_docs}})
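
As a quick sanity check, we can count how many chunks ended up in the document store (count_documents is part of the InMemoryDocumentStore API):

# optional check: number of 200-word chunks indexed from the Wikipedia page
print(f"Indexed {document_store.count_documents()} document chunks")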

RAG Pipeline

from haystack.components.builders import PromptBuilder

prompt_template = """
<|begin_of_text|><|start_header_id|>user<|end_header_id|>


Using the information contained in the context, give a comprehensive answer to the question.
If the answer cannot be deduced from the context, do not give an answer.

Context:
  {% for doc in documents %}
  {{ doc.content }} URL:{{ doc.meta['url'] }}
  {% endfor %}
  Question: {{query}}<|eot_id|>

<|start_header_id|>assistant<|end_header_id|>


"""
prompt_builder = PromptBuilder(template=prompt_template)
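
To see what this builder produces, you can render the template outside the pipeline with a dummy document; the snippet below is only an illustration (content and query are made up).

# optional preview of the rendered prompt, using a fake document
preview = prompt_builder.run(
    documents=[Document(content="Example passage about the ceremony.", meta={"url": "https://en.wikipedia.org/wiki/96th_Academy_Awards"})],
    query="Who hosted the ceremony?",
)["prompt"]
print(preview)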

Here, we use the HuggingFaceLocalGenerator, loading the model in Colab with 4-bit quantization.

import torch
from haystack.components.generators import HuggingFaceLocalGenerator

generator = HuggingFaceLocalGenerator(
    model="meta-llama/Meta-Llama-3.1-8B-Instruct",
    huggingface_pipeline_kwargs={"device_map":"auto",
                                  "model_kwargs":{"load_in_4bit":True,
                                                  "bnb_4bit_use_double_quant":True,
                                                  "bnb_4bit_quant_type":"nf4",
                                                  "bnb_4bit_compute_dtype":torch.bfloat16}},
    generation_kwargs={"max_new_tokens": 500})

generator.warm_up()
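
Optionally, you can smoke-test the generator on its own before wiring it into the pipeline; the prompt below is just a hand-written Llama 3.1 chat-formatted string.

# quick check that generation works outside the pipeline
test_prompt = (
    "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n"
    "Who directed Oppenheimer?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"
)
print(generator.run(test_prompt)["replies"][0])
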
from haystack.components.retrievers.in_memory import InMemoryEmbeddingRetriever

query_pipeline = Pipeline()

query_pipeline.add_component(
    "text_embedder",
    SentenceTransformersTextEmbedder(
        model="Snowflake/snowflake-arctic-embed-l",  # good embedding model: https://huggingface.co/Snowflake/snowflake-arctic-embed-l
        device=ComponentDevice.from_str("cuda:0"),  # load the model on GPU
        prefix="Represent this sentence for searching relevant passages: ",  # as explained in the model card (https://huggingface.co/Snowflake/snowflake-arctic-embed-l#using-huggingface-transformers), queries should be prefixed
    ))
query_pipeline.add_component("retriever", InMemoryEmbeddingRetriever(document_store=document_store, top_k=5))
query_pipeline.add_component("prompt_builder", PromptBuilder(template=prompt_template))
query_pipeline.add_component("generator", generator)

# connect the components
query_pipeline.connect("text_embedder.embedding", "retriever.query_embedding")
query_pipeline.connect("retriever.documents", "prompt_builder.documents")
query_pipeline.connect("prompt_builder", "generator")

Let’s ask some questions!

def get_generative_answer(query):
  results = query_pipeline.run({
      "text_embedder": {"text": query},
      "prompt_builder": {"query": query}
    }
  )
  answer = results["generator"]["replies"][0]
  rich.print(answer)

get_generative_answer("Who won the Best Picture Award in 2024?")
get_generative_answer("What was the box office performance of the Best Picture nominees?")
get_generative_answer("What was the reception of the ceremony")
get_generative_answer("Can you name some of the films that got multiple nominations?")
# unrelated question: let's see how our RAG pipeline performs.

get_generative_answer("Audioslave was formed by members of two iconic bands. Can you name the bands and discuss the sound of Audioslave in comparison?")

This is a simple demo. We can improve the RAG pipeline in several ways, including better preprocessing of the input documents.
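
For example, one possible direction (a sketch, not part of the original notebook) is to clean the raw text and split it by sentences with a small overlap, combining Haystack's DocumentCleaner with the DocumentSplitter:

from haystack.components.preprocessors import DocumentCleaner, DocumentSplitter

# sketch of an improved preprocessing front-end for the indexing pipeline:
# clean the raw Wikipedia text, then split by sentences with overlap so that
# relevant facts are less likely to be cut in half at chunk boundaries
cleaner = DocumentCleaner(remove_empty_lines=True, remove_extra_whitespaces=True)
splitter = DocumentSplitter(split_by="sentence", split_length=10, split_overlap=2)

cleaned = cleaner.run(documents=raw_docs)["documents"]
chunks = splitter.run(documents=cleaned)["documents"]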

To use Llama 3.1 models in Haystack, you also have other options:

  • LlamaCppGenerator and OllamaGenerator: using the GGUF quantized format, these solutions are ideal for running LLMs on standard machines (even without GPUs).
  • HuggingFaceAPIGenerator, which allows you to query a local TGI container or a (paid) HF Inference Endpoint. TGI is a toolkit for efficiently deploying and serving LLMs in production.
  • vLLM via OpenAIGenerator: a high-throughput and memory-efficient inference and serving engine for LLMs (see the sketch below).
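
For instance, here is a minimal sketch of the vLLM option, assuming an OpenAI-compatible vLLM server is already running locally on its default port (model name and URL are illustrative):

from haystack.components.generators import OpenAIGenerator
from haystack.utils import Secret

# assumes a server started with, e.g.:
#   python -m vllm.entrypoints.openai.api_server --model meta-llama/Meta-Llama-3.1-8B-Instruct
vllm_generator = OpenAIGenerator(
    api_key=Secret.from_token("VLLM-PLACEHOLDER-API-KEY"),  # vLLM does not validate the key by default
    model="meta-llama/Meta-Llama-3.1-8B-Instruct",
    api_base_url="http://localhost:8000/v1",  # assumption: default vLLM port
    generation_kwargs={"max_tokens": 500},
)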

(Notebook by Stefano Fiorucci)