In [1]:
%pip install -qU \
    langsmith==0.1.125 \
    langchain_openai \
    langchain_huggingface \
    langchain-core==0.2.41 \
    langchain \
    langchain_community \
    langchain-qdrant==0.1.4 \
    langchain-text-splitters \
    langchain-openai \
    langchain_huggingface \
    faiss-cpu \
    langchain-experimental \
    unstructured==0.15.7 \
    python-pptx==1.0.2 \
    nltk==3.9.1 \
    PyMuPDF==1.24.10 \
    ragas==0.1.18 \
    protobuf==3.20.3 \
    pyarrow==14.0.1 \
    fsspec==2024.6.1 \
    sentence_transformers \
    datasets \
    pyarrow==14.0.1


  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mBuilding wheel for grpcio-tools [0m[1;32m([0m[32mpyproject.toml[0m[1;32m)[0m did not run successfully.
  [31m│[0m exit code: [1;36m1[0m
  [31m╰─>[0m [31m[1154 lines of output][0m
  [31m   [0m running bdist_wheel
  [31m   [0m running build
  [31m   [0m running build_py
  [31m   [0m creating build/lib.macosx-14.0-arm64-cpython-311/grpc_tools
  [31m   [0m copying grpc_tools/command.py -> build/lib.macosx-14.0-arm64-cpython-311/grpc_tools
  [31m   [0m copying grpc_tools/__init__.py -> build/lib.macosx-14.0-arm64-cpython-311/grpc_tools
  [31m   [0m copying grpc_tools/protoc.py -> build/lib.macosx-14.0-arm64-cpython-311/grpc_tools
  [31m   [0m creating build/lib.macosx-14.0-arm64-cpython-311/grpc_tools/_proto/google/protobuf
  [31m   [0m copying grpc_tools/_proto/google/protobuf/wrappers.proto -> build/lib.macosx-14.0-arm64-cpython-311/grpc_tools/_proto/google/protobuf
  [31m   [0

In [None]:
import nest_asyncio

# Patch asyncio before any async work runs in this notebook.
# NOTE(review): presumably needed because the kernel already owns a running event loop
# and later cells (PDF loading, RAGAS) drive async code — confirm it is still required.
nest_asyncio.apply()

In [None]:
import os
import getpass
from uuid import uuid4

# LangSmith tracing: switch it on and name the project the runs are recorded under.
os.environ.update(
    {
        "LANGCHAIN_TRACING_V2": "true",
        "LANGCHAIN_PROJECT": "AIM-SDG-MidTerm - AI Safety",
    }
)

# Credentials are read interactively with getpass instead of being hard-coded.
# The prompts appear in this order: LangChain key, OpenAI key, Qdrant key, Qdrant URL.
for _env_name, _prompt_text in (
    ("LANGCHAIN_API_KEY", "LangChain API Key:"),
    ("OPENAI_API_KEY", "OpenAI API Key:"),
    ("QDRANT_API_KEY", "Enter Your Qdrant API Key: "),
    ("QDRANT_URL", "Enter Your Qdrant URL: "),
):
    os.environ[_env_name] = getpass.getpass(_prompt_text)


## Preparing Training documents

In [None]:
from langchain_experimental.text_splitter import SemanticChunker
from enum import Enum
from typing import List
from langchain_community.document_loaders import PyMuPDFLoader
from langchain_core.documents import Document
import asyncio

class PDFLoaderWrapper():
    """Load one or more PDF files into a flat list of LangChain ``Document`` objects.

    Args:
        file_path: A single path/URL or a list of them.
        loader_type: Backend used to read each file. Accepts a ``LoaderType`` member
            or its string value (e.g. ``"pymupdf"``).

    Raises:
        ValueError: If ``loader_type`` is not a supported ``LoaderType``.
    """

    class LoaderType(str, Enum):
        """Supported PDF loading backends."""
        PYMUPDF = "pymupdf"

    def __init__(self, file_path: str | List[str] , loader_type: LoaderType = LoaderType.PYMUPDF):
        # Normalise to a list so `aload` can always iterate.
        self.file_path = file_path if isinstance(file_path, list) else [file_path]
        # Coerce through the enum so an unsupported value fails here with ValueError
        # instead of `aload` silently skipping every file and returning [].
        self.loader_type = self.LoaderType(loader_type)

    async def aload(self) -> List[Document]:
        """Load every configured file and return all resulting documents.

        Loading is best-effort: a file that raises is reported with ``print`` and
        skipped, and the remaining files are still loaded.
        """
        all_docs = []
        for file_path in self.file_path:
            if self.loader_type == self.LoaderType.PYMUPDF:
                try:
                    loader = PyMuPDFLoader(file_path)
                    docs = await loader.aload()
                    all_docs.extend(docs)
                except Exception as e:
                    print(f"Error loading file {file_path}: {e}")
                    continue
        return all_docs


In [None]:

BOR_FILE_PATH = "https://www.whitehouse.gov/wp-content/uploads/2022/10/Blueprint-for-an-AI-Bill-of-Rights.pdf"
NIST_FILE_PATH = "https://nvlpubs.nist.gov/nistpubs/ai/NIST.AI.600-1.pdf"
SMALL_DOC = "https://arxiv.org/pdf/1908.10084"
documents_to_preload = [
    BOR_FILE_PATH,
    NIST_FILE_PATH
    # SMALL_DOC
]

pdf_loader = PDFLoaderWrapper(
    documents_to_preload, PDFLoaderWrapper.LoaderType.PYMUPDF
)
documents = await pdf_loader.aload()



In [None]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Chunks are at most 1024 characters (measured with len) and overlap by 50.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1024, chunk_overlap=50, length_function=len)

In [None]:
# `documents` is already the list of Document objects returned by `pdf_loader.aload()`;
# a list has no `.load()` method, so it is passed to the splitter directly.
training_documents = text_splitter.split_documents(documents)

In [None]:
import uuid

# Give every chunk a unique string id in its metadata; the ids are later used as
# dictionary keys of the datasets that get serialised to JSON.
id_set = set()

for document in training_documents:
  doc_id = str(uuid.uuid4())
  # Regenerate on the (extremely unlikely) collision. The retry must also produce a
  # string, otherwise a raw UUID object would end up in the metadata.
  while doc_id in id_set:
    doc_id = str(uuid.uuid4())
  id_set.add(doc_id)
  document.metadata["id"] = doc_id

In [None]:
import random

# Fixed seed so the same chunks land in the same split on every run.
SPLIT_SEED = 42

total_documents = len(training_documents)

# Define the split percentages
train_percent = 0.75  # 75% for training
val_percent = 0.125  # 12.5% for validation
test_percent = 0.125  # 12.5% for testing
assert abs(train_percent + val_percent + test_percent - 1.0) < 1e-9, "split percentages must sum to 1"

# Shuffle a copy with a dedicated, seeded generator: `training_documents` keeps its
# original order and re-running this cell yields the same split.
shuffled_documents = list(training_documents)
random.Random(SPLIT_SEED).shuffle(shuffled_documents)

# Calculate the split indices
train_split = int(total_documents * train_percent)
val_split = int(total_documents * (train_percent + val_percent))

# Split the documents; the test split takes whatever remains after train and validation.
training_split_documents = shuffled_documents[:train_split]
val_split_documents = shuffled_documents[train_split:val_split]
test_split_documents = shuffled_documents[val_split:]

print(f"Training set: {len(training_split_documents)} documents")
print(f"Validation set: {len(val_split_documents)} documents")
print(f"Test set: {len(test_split_documents)} documents")


## Constructing a Fine-Tuning dataset

In [None]:
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate

# Chat model used to synthesise questions; temperature 0 is requested.
qa_chat_model = ChatOpenAI(
    model="gpt-4o",
    temperature=0
)

# Prompt with two template variables: {n_questions} and {context}.
# The requested output is a numbered list ("1. ...", "2. ..."), which is the format
# `create_questions` parses line by line.
# NOTE(review): the prompt asks the model to "Check internet", but the chain below is
# only prompt | model with no tool attached — confirm whether that instruction is intended.
qa_prompt = """\
Given the following context, you must generate questions based on only the provided context.
Check internet the question that you generate is realistic and asked by online users and 
include only such questions in the output to be realistic.
You are to generate {n_questions} questions which should be provided in the following format:

1. QUESTION #1
2. QUESTION #2
...

Context:
{context}
"""

qa_prompt_template = ChatPromptTemplate.from_template(qa_prompt)

# Runnable pipeline: fill the prompt, then send it to the chat model.
question_generation_chain = qa_prompt_template | qa_chat_model

In [None]:
import tqdm
def _parse_numbered_questions(raw_text):
  """Return the question texts from a response formatted as ``1. ...`` lines.

  Lines that are blank, unnumbered, or have nothing after the number are ignored.
  Only the first ``.`` on a line is treated as the separator, so periods inside the
  question itself (e.g. "U.S.", "section 2.1") are preserved.
  """
  parsed = []
  for line in raw_text.splitlines():
    number, separator, remainder = line.partition(".")
    remainder = remainder.strip()
    if separator and number.strip().isdigit() and remainder:
      parsed.append(remainder)
  return parsed


def create_questions(documents, n_questions):
  """Generate synthetic questions for each document chunk.

  Args:
    documents: Iterable of chunks exposing ``page_content`` and ``metadata["id"]``.
    n_questions: Number of questions requested from the model per chunk.

  Returns:
    A ``(questions, relevant_docs)`` tuple. ``questions`` maps a fresh question id to
    the question text; ``relevant_docs`` maps the same id to a one-element list holding
    the id of the chunk the question was generated from.
  """
  questions = {}
  relevant_docs = {}
  for document in tqdm.tqdm(documents):
    questions_generated = question_generation_chain.invoke(
        {"context": document.page_content, "n_questions": n_questions}
    )
    for question in _parse_numbered_questions(questions_generated.content):
      question_id = str(uuid.uuid4())
      questions[question_id] = question
      relevant_docs[question_id] = [document.metadata["id"]]
  return questions, relevant_docs

In [None]:
# Ten questions are requested for every training chunk.
training_questions, training_relevant_contexts = create_questions(
    training_split_documents, n_questions=10
)
len(training_questions)

In [None]:
# Five questions are requested for every validation chunk.
val_questions, val_relevant_contexts = create_questions(
    val_split_documents, n_questions=5
)
len(val_questions)

In [None]:
# Five questions are requested for every test chunk.
test_questions, test_relevant_contexts = create_questions(
    test_split_documents, n_questions=5
)
len(test_questions)

In [None]:
import json

def _corpus_from(split_documents):
  """Map each chunk's metadata id to its text."""
  return {doc.metadata["id"]: doc.page_content for doc in split_documents}


def _save_dataset(dataset, path):
  """Serialise one dataset dictionary to ``path`` as a single JSON object."""
  with open(path, "w", encoding="utf-8") as f:
    json.dump(dataset, f)


# Every dataset has the same three keys: question texts, the chunk ids relevant to each
# question, and the chunk texts themselves.
# NOTE: the files carry a .jsonl extension but each holds ONE JSON object (json.dump),
# not one record per line; the names are kept so existing readers still find them.
training_corpus = _corpus_from(training_split_documents)

train_dataset = {
    "questions" : training_questions,
    "relevant_contexts" : training_relevant_contexts,
    "corpus" : training_corpus
}

_save_dataset(train_dataset, "training_dataset.jsonl")

val_corpus = _corpus_from(val_split_documents)

val_dataset = {
    "questions" : val_questions,
    "relevant_contexts" : val_relevant_contexts,
    "corpus" : val_corpus
}

_save_dataset(val_dataset, "val_dataset.jsonl")

# Previously stored under the misleading name `train_corpus`.
test_corpus = _corpus_from(test_split_documents)

test_dataset = {
    "questions" : test_questions,
    "relevant_contexts" : test_relevant_contexts,
    "corpus" : test_corpus
}

_save_dataset(test_dataset, "test_dataset.jsonl")

## Fine-tuning `Snowflake/snowflake-arctic-embed-l`

In [None]:
from sentence_transformers import SentenceTransformer
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
from sentence_transformers import InputExample

# Base embedding model that the following cells fine-tune; loaded by its model id.
model_id = "Snowflake/snowflake-arctic-embed-l"
model = SentenceTransformer(model_id)

In [None]:
from sentence_transformers.losses import MatryoshkaLoss, MultipleNegativesRankingLoss

# Number of (question, chunk) pairs per training batch.
BATCH_SIZE = 20

corpus = train_dataset['corpus']
queries = train_dataset['questions']
relevant_docs = train_dataset['relevant_contexts']

# One positive pair per generated question: (question text, text of its source chunk).
examples = [
    InputExample(texts=[query, corpus[relevant_docs[query_id][0]]])
    for query_id, query in queries.items()
]

# torch DataLoader
# MultipleNegativesRankingLoss uses the other pairs in a batch as negatives, so the
# default batch size of 1 would leave it nothing to rank against. Shuffling spreads the
# questions generated from one chunk across batches, since a chunk sharing a batch with
# its own other questions would be treated as a negative for them.
loader = DataLoader(
    examples,
    batch_size=BATCH_SIZE,
    shuffle=True,
)

# Using MultipleNegativesRankingLoss and MatryoshkaLoss for training
# NOTE(review): the largest Matryoshka dimension listed is 768 — confirm it matches the
# output dimension of the chosen model, otherwise the full-size embedding is not trained.
matryoshka_dimensions = [768, 512, 256, 128, 64]
inner_train_loss = MultipleNegativesRankingLoss(model)
train_loss = MatryoshkaLoss(
    model, inner_train_loss, matryoshka_dims=matryoshka_dimensions
)

In [None]:
from sentence_transformers.evaluation import InformationRetrievalEvaluator

# From here on, the shared names `queries`, `corpus` and `relevant_docs` refer to the
# validation split, which drives the evaluation performed during training.
queries = val_dataset["questions"]
corpus = val_dataset["corpus"]
relevant_docs = val_dataset["relevant_contexts"]

evaluator = InformationRetrievalEvaluator(
    queries=queries,
    corpus=corpus,
    relevant_docs=relevant_docs,
)

In [None]:
# Number of training epochs; also used below to size the warm-up schedule.
EPOCHS = 5

In [None]:
# Warm-up spans 10% of all training steps: len(loader) batches per epoch times EPOCHS.
warmup_steps = int(len(loader) * EPOCHS * 0.1)

# The validation evaluator runs every 50 steps; the model is written to 'finetuned_arctic'.
model.fit(
    train_objectives=[(loader, train_loss)],
    evaluator=evaluator,
    evaluation_steps=50,
    epochs=EPOCHS,
    warmup_steps=warmup_steps,
    show_progress_bar=True,
    output_path='finetuned_arctic',
)

In [None]:
import pandas as pd

from langchain_community.vectorstores import FAISS
from langchain_openai.embeddings import OpenAIEmbeddings
from langchain_core.documents import Document

def evaluate_openai(
    dataset,
    embed_model,
    top_k=5,
    verbose=False,
):
  """Measure top-k retrieval hits of an embedding model on a question dataset.

  The corpus is indexed in a FAISS store built with ``embed_model``; each question is
  then looked up and counted as a hit when the first relevant chunk id listed for it is
  among the ``top_k`` retrieved chunks. Despite the name, any embedding object accepted
  by ``FAISS.from_documents`` can be passed.

  Args:
    dataset: Dict with ``'corpus'``, ``'questions'`` and ``'relevant_contexts'``.
    embed_model: Embedding model used to build the vector store.
    top_k: Number of chunks retrieved per question.
    verbose: Accepted but currently unused.

  Returns:
    One dict per question with keys ``id``, ``question``, ``expected_id``, ``is_hit``.
  """
  corpus = dataset['corpus']
  questions = dataset['questions']
  relevant_docs = dataset['relevant_contexts']

  # Wrap each corpus entry so its id travels with it through retrieval.
  indexed_documents = []
  for doc_id, content in corpus.items():
    indexed_documents.append(Document(page_content=content, metadata={"id": doc_id}))

  retriever = FAISS.from_documents(indexed_documents, embed_model).as_retriever(
      search_kwargs={"k": top_k}
  )

  eval_results = []
  for question_id, question in tqdm.tqdm(questions.items()):
    retrieved_ids = [node.metadata["id"] for node in retriever.invoke(question)]
    expected_id = relevant_docs[question_id][0]
    eval_results.append(
        {
            "id": question_id,
            "question": question,
            "expected_id": expected_id,
            "is_hit": expected_id in retrieved_ids,
        }
    )

  return eval_results

We compare three embedding models:
1. text-embedding-3-small
2. Snowflake/snowflake-arctic-embed-l (base)
3. Snowflake/snowflake-arctic-embed-l (fine-tuned)

#### Evaluating `text-embedding-3-small`

In [None]:
# Score the OpenAI embedding model on the held-out test dataset.
te3_openai = OpenAIEmbeddings(model="text-embedding-3-small")
te3_results = evaluate_openai(dataset=test_dataset, embed_model=te3_openai)

In [None]:
# `is_hit` is a boolean per question, so its mean is the fraction of questions whose
# expected chunk was among the retrieved ones.
te3_results_df = pd.DataFrame(te3_results)
te3_hit_rate = te3_results_df["is_hit"].mean()
te3_hit_rate

#### Evaluating `Snowflake/snowflake-arctic-embed-l`

In [None]:
from langchain_huggingface import HuggingFaceEmbeddings

# Base (not fine-tuned) arctic model. The result variable keeps its `_m` name even
# though the model loaded here is the `-l` variant.
huggingface_embeddings = HuggingFaceEmbeddings(model_name="Snowflake/snowflake-arctic-embed-l")
arctic_embed_m_results = evaluate_openai(
    dataset=test_dataset,
    embed_model=huggingface_embeddings,
)

In [None]:
# One row per evaluated question for the base arctic model.
arctic_embed_m_results_df = pd.DataFrame(arctic_embed_m_results)

In [None]:
# Mean of the boolean `is_hit` column = hit rate of the base arctic model.
arctic_embed_m_hit_rate = arctic_embed_m_results_df["is_hit"].mean()
arctic_embed_m_hit_rate

#### Evaluating `Snowflake/snowflake-arctic-embed-l` (fine-tuned)

In [None]:
# "finetuned_arctic" is the output path given to `model.fit` earlier in this notebook.
finetune_embeddings = HuggingFaceEmbeddings(model_name="finetuned_arctic")
finetune_results = evaluate_openai(
    dataset=test_dataset,
    embed_model=finetune_embeddings,
)

In [None]:
# One row per evaluated question for the fine-tuned model.
finetune_results_df = pd.DataFrame(finetune_results)

In [None]:
# Mean of the boolean `is_hit` column = hit rate of the fine-tuned model.
finetune_hit_rate = finetune_results_df["is_hit"].mean()
finetune_hit_rate

# Checking the models with RAG

In [None]:
from langchain_community.vectorstores import FAISS
from langchain_core.prompts import ChatPromptTemplate
from operator import itemgetter
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough, RunnableParallel

# using the training_documents which already loaded as split documents
def create_rag_chain(huggingface_embeddings):
    """Build a retrieval-augmented QA chain on top of the given embedding model.

    All chunks in the notebook-level ``training_documents`` are indexed in a FAISS store
    using ``huggingface_embeddings``; six chunks are retrieved per question and passed,
    together with the question, to gpt-4o.

    Args:
        huggingface_embeddings: Embedding model used to index and query the chunks.

    Returns:
        A runnable that takes ``{"question": ...}`` and yields a dict with ``"response"``
        (the answer text) and ``"context"`` (the retrieved documents).
    """
    RAG_PROMPT = """\
    Given a provided context and a question, you must answer the question. If you do not know the answer, you must state that you do not know.

    Context:
    {context}

    Question:
    {question}

    Answer:
    """

    # Retrieval side: index the chunks, then expose a top-6 retriever.
    retriever = FAISS.from_documents(training_documents, huggingface_embeddings).as_retriever(
        search_kwargs={"k": 6}
    )

    # Generation side: prompt -> chat model -> plain string.
    answer_step = (
        ChatPromptTemplate.from_template(RAG_PROMPT)
        | ChatOpenAI(model="gpt-4o", temperature=0)
        | StrOutputParser()
    )

    pick_question = itemgetter("question")
    pick_context = itemgetter("context")

    # The retrieved context is carried through to the output next to the response.
    return (
        {"context": pick_question | retriever, "question": pick_question}
        | RunnablePassthrough.assign(context=pick_context)
        | {"response": answer_step, "context": pick_context}
    )

In [None]:
# Two otherwise identical RAG chains that differ only in the embedding model used for
# retrieval: the base arctic model and the fine-tuned one.
base_rag_chain = create_rag_chain(huggingface_embeddings)
fine_tuned_rag_chain = create_rag_chain(finetune_embeddings)

In [None]:
# Try some questions here

# RAGAS Evaluation

In [None]:
from ragas.testset.generator import TestsetGenerator
from ragas.testset.evolutions import simple, reasoning, multi_context
from langchain_openai import OpenAIEmbeddings

# gpt-4o serves as both generator and critic; embeddings use the OpenAI default model.
generator_llm, critic_llm = ChatOpenAI(model="gpt-4o"), ChatOpenAI(model="gpt-4o")
embeddings = OpenAIEmbeddings()

generator = TestsetGenerator.from_langchain(
    generator_llm=generator_llm,
    critic_llm=critic_llm,
    embeddings=embeddings,
)

In [None]:
# The RAGAS test set is generated from the held-out test chunks created earlier:
# 20 samples, half simple, a quarter reasoning and a quarter multi-context.
testset = generator.generate_with_langchain_docs(
    test_split_documents,
    test_size=20,
    distributions={simple: 0.5, reasoning: 0.25, multi_context: 0.25},
)

In [None]:
# Preview the first rows of the generated test set.
testset.to_pandas().head()

In [None]:
from datasets import Dataset

def generate_answers(chain, testset):
  """Run every test-set question through a RAG chain and collect the results.

  Args:
    chain: Runnable that accepts ``{"question": ...}`` and returns a dict with
      ``"response"`` and ``"context"`` (documents exposing ``page_content``).
    testset: Object whose ``to_pandas()`` yields ``question`` and ``ground_truth`` columns.

  Returns:
    A ``Dataset`` with columns ``question``, ``answer``, ``contexts`` and ``ground_truth``.
  """
  # Convert once and read both columns from the same frame.
  testset_df = testset.to_pandas()
  questions = testset_df["question"].values.tolist()
  ground_truths = testset_df["ground_truth"].values.tolist()

  answers = []
  contexts = []
  for question in tqdm.tqdm(questions):
    answer = chain.invoke({"question" : question})
    answers.append(answer["response"])
    # Keep only the text of each retrieved document.
    contexts.append([context.page_content for context in answer["context"]])

  return Dataset.from_dict({
      "question" : questions,
      "answer" : answers,
      "contexts" : contexts,
      "ground_truth" : ground_truths
  })

In [None]:
# Answer the same test set with both chains so their outputs can be compared.
base_dataset = generate_answers(base_rag_chain, testset)
finetune_dataset = generate_answers(fine_tuned_rag_chain, testset)

In [None]:
from ragas.metrics import (
    faithfulness,
    answer_relevancy,
    context_recall,
    context_precision,
)
from ragas import evaluate

# Score the answers produced with the base embeddings on the four RAGAS metrics.
base_result = evaluate(
    dataset=base_dataset,
    metrics=[faithfulness, answer_relevancy, context_recall, context_precision],
)

In [None]:
# Show the RAGAS result for the base chain.
base_result

In [None]:
# First rows of the base-chain result as a DataFrame.
base_result.to_pandas().head()

In [None]:
# Score the answers produced with the fine-tuned embeddings on the same four metrics.
fine_tuned_result = evaluate(
    dataset=finetune_dataset,
    metrics=[faithfulness, answer_relevancy, context_recall, context_precision],
)

In [None]:
# Show the RAGAS result for the fine-tuned chain.
fine_tuned_result

In [None]:
# First rows of the fine-tuned-chain result as a DataFrame.
fine_tuned_result.to_pandas().head()

In [None]:
# Compare base and fine-tuned results side by side.
# The RAGAS result objects have no `.merge`, so both are converted to DataFrames first
# and joined on the question text; clashing column names get a suffix per chain.
metric_columns = ["faithfulness", "answer_relevancy", "context_recall", "context_precision"]

base_scores_df = base_result.to_pandas()
fine_tuned_scores_df = fine_tuned_result.to_pandas()

merged_result = base_scores_df.merge(
    fine_tuned_scores_df,
    on="question",
    suffixes=("_base", "_fine_tuned"),
)
# display() is needed here because this is not the last expression of the cell.
display(merged_result.head())

# Plot the mean of every metric for both chains.
mean_scores = pd.DataFrame(
    {
        "base": base_scores_df[metric_columns].mean(),
        "fine_tuned": fine_tuned_scores_df[metric_columns].mean(),
    }
)
mean_scores.plot.bar(
    rot=0,
    title="RAGAS metrics: base vs fine-tuned embeddings",
    ylabel="mean score",
);


