File size: 2,648 Bytes
7b666bb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
import streamlit as st
from PyPDF2 import PdfReader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.chains import RetrievalQA
from langchain.llms import HuggingFacePipeline
import torch
from transformers import pipeline

# Load a smaller LLM (e.g., Zephyr-7B or Mistral-7B)
def load_llm():
    """Build the LangChain LLM used to answer questions.

    Creates a ``transformers`` text-generation pipeline for
    ``HuggingFaceH4/zephyr-7b-alpha`` (requested with ``torch.float16`` and
    ``device_map="auto"``) and wraps it in ``HuggingFacePipeline``.

    Returns:
        The ``HuggingFacePipeline`` wrapper around the generation pipeline.
    """
    generator = pipeline(
        "text-generation",
        model="HuggingFaceH4/zephyr-7b-alpha",  # Replace with your preferred model
        torch_dtype=torch.float16,
        device_map="auto",
    )
    return HuggingFacePipeline(pipeline=generator)

# Extract text from PDF
def extract_text_from_pdf(file):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        file: Whatever ``PdfReader`` accepts; ``main`` passes either the
            object returned by the Streamlit uploader or a path string.

    Returns:
        The page texts joined with no separator. The result is an empty
        string when no page yields any text.
    """
    reader = PdfReader(file)
    # `or ""` keeps the join from failing with TypeError should a page's
    # extract_text() yield None instead of a string (defensive: presumably
    # only older PDF backends do this -- verify against the installed one).
    return "".join(page.extract_text() or "" for page in reader.pages)

# Split text into chunks
def split_text(text, chunk_size=1000, chunk_overlap=200):
    """Break ``text`` into overlapping chunks.

    Args:
        text: The string to split.
        chunk_size: Forwarded to ``RecursiveCharacterTextSplitter`` as
            ``chunk_size``.
        chunk_overlap: Forwarded to ``RecursiveCharacterTextSplitter`` as
            ``chunk_overlap``.

    Returns:
        Whatever ``RecursiveCharacterTextSplitter.split_text`` returns for
        ``text`` (the chunk strings).
    """
    return RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    ).split_text(text)

# Create embeddings and vector store
def create_vector_store(chunks):
    """Embed ``chunks`` and index them in a FAISS vector store.

    Args:
        chunks: The text chunks to index.

    Returns:
        The FAISS store built by ``FAISS.from_texts`` using
        ``HuggingFaceEmbeddings`` with the ``all-MiniLM-L6-v2`` model.
    """
    embedding_model = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
    return FAISS.from_texts(chunks, embedding_model)

# Query the PDF
def query_pdf(vector_store, query, llm):
    """Answer ``query`` with a retrieval QA chain over ``vector_store``.

    Args:
        vector_store: Store whose ``as_retriever()`` supplies the retriever.
        query: The question to ask.
        llm: The language model handed to the chain.

    Returns:
        The value returned by the chain's ``run`` call for ``query``.
    """
    retriever = vector_store.as_retriever()
    chain = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=retriever,
    )
    return chain.run(query)

# Streamlit App
def main():
    """Render the "Chat with PDF" Streamlit page.

    Flow: pick a PDF (the upload, or ``default.pdf`` when nothing is
    uploaded), index its text in a vector store, then answer the question
    typed by the user and show the retrieved source chunks.

    Streamlit re-executes this function on every widget interaction, so the
    vector store and the LLM are kept in ``st.session_state`` and rebuilt
    only when the selected PDF changes (store) or on first use (LLM).
    """
    st.title("Chat with PDF")
    st.write("Upload a PDF and ask questions about it!")

    # File uploader
    uploaded_file = st.file_uploader("Upload a PDF", type="pdf")
    if uploaded_file is None:
        st.info("Using default PDF.")
        uploaded_file = "default.pdf"  # Add a default PDF
        pdf_key = uploaded_file
    else:
        # Assumes name + size is enough to tell two uploads apart.
        pdf_key = (
            getattr(uploaded_file, "name", None),
            getattr(uploaded_file, "size", None),
        )

    # Rebuild the index only when a different PDF is selected.
    if st.session_state.get("pdf_key") != pdf_key:
        try:
            text = extract_text_from_pdf(uploaded_file)
        except FileNotFoundError:
            # Only the path-based default can be missing on disk.
            st.error("PDF file not found. Please upload a PDF.")
            return

        chunks = split_text(text)
        if not chunks:
            # Nothing to embed (e.g. no extractable text); indexing an
            # empty list would fail inside the vector store.
            st.error("No extractable text was found in this PDF.")
            return

        st.session_state["vector_store"] = create_vector_store(chunks)
        st.session_state["pdf_key"] = pdf_key
    vector_store = st.session_state["vector_store"]

    # Query translation options.
    # NOTE: the selection is displayed but not yet applied to retrieval.
    st.selectbox(
        "Query Translation Method",
        ["Multi-Query", "RAG Fusion", "Decomposition", "Step Back", "HyDE"],
        help="Choose a method to improve query retrieval."
    )

    # User input
    query = st.text_input("Ask a question about the PDF:")
    if not query:
        return

    # Load the model lazily and once per session: it is only needed when a
    # question is actually asked, and reloading it on each rerun is costly.
    if "llm" not in st.session_state:
        st.session_state["llm"] = load_llm()

    # query_pdf returns the answer itself, not a mapping, so it is shown
    # directly instead of being indexed by key.
    answer = query_pdf(vector_store, query, st.session_state["llm"])

    # Fetch the chunks for display with the store's default similarity
    # search (presumably the same lookup the default retriever performs).
    source_docs = vector_store.similarity_search(query)
    source_text = "\n\n".join(doc.page_content for doc in source_docs)

    st.write("**Answer:**", answer)
    st.write("**Source Text:**", source_text)

# Run the app only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()