File size: 4,240 Bytes
709f6b7
cb0ff81
 
6ccf2cb
cb0ff81
8d47fc3
cb0ff81
709f6b7
 
 
bbd8a88
709f6b7
cb0ff81
6b52351
cb0ff81
709f6b7
8d47fc3
bbd8a88
 
3875c87
bbd8a88
 
709f6b7
cb0ff81
bbd8a88
cb0ff81
bbd8a88
 
 
dea11f3
bbd8a88
 
cb0ff81
8d47fc3
3875c87
bbd8a88
cb0ff81
3875c87
8d47fc3
bbd8a88
6956d92
cb0ff81
 
 
 
8d47fc3
 
cb0ff81
 
6b52351
 
 
cb0ff81
6b52351
 
 
 
 
 
 
cb0ff81
 
6b52351
cb0ff81
6b52351
cb0ff81
 
 
 
 
920b3d6
cb0ff81
 
 
 
 
 
 
 
 
 
 
 
8d47fc3
7c797e6
cb0ff81
bbd8a88
cb0ff81
 
bbd8a88
cb0ff81
bbd8a88
8d47fc3
 
bbd8a88
 
cb0ff81
bbd8a88
 
cb0ff81
 
dea11f3
bbd8a88
cb0ff81
 
 
 
 
 
bbd8a88
 
 
 
 
cb0ff81
39d36c9
cb0ff81
 
 
 
39d36c9
bbd8a88
cb0ff81
3875c87
cb0ff81
 
e428e3e
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
import os
import shutil
import tempfile
import fitz  # PyMuPDF
import streamlit as st
import logging

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import SentenceTransformerEmbeddings
from langchain.chains import RetrievalQA
from langchain_community.llms import HuggingFacePipeline
from langchain.prompts import PromptTemplate
from langchain_community.document_loaders import TextLoader

# --- Streamlit Config ---
# NOTE: st.set_page_config must be the FIRST Streamlit call in the script,
# so keep this block at the top, before any other st.* usage.
st.set_page_config(page_title="πŸ“š RAG PDF Chatbot", layout="wide")
st.title("πŸ“š RAG-based PDF Chatbot")

# --- Logging ---
# Root-logger config; module code below logs via the logging module directly.
logging.basicConfig(level=logging.INFO)

# --- Load Model ---
@st.cache_resource
def load_model():
    """Load the LaMini-T5 seq2seq model and wrap it for LangChain.

    Decorated with ``st.cache_resource`` so the tokenizer/model pair is
    instantiated only once per Streamlit server process.

    Returns:
        A ``HuggingFacePipeline`` LLM backed by a text2text-generation
        pipeline (max_length=512).
    """
    model_id = "MBZUAI/LaMini-T5-738M"
    tok = AutoTokenizer.from_pretrained(model_id)
    seq2seq = AutoModelForSeq2SeqLM.from_pretrained(model_id)
    generator = pipeline(
        "text2text-generation",
        model=seq2seq,
        tokenizer=tok,
        max_length=512,
    )
    return HuggingFacePipeline(pipeline=generator)

# --- Extract PDF Text ---
def extract_text_from_pdf(file):
    try:
        doc = fitz.open(stream=file.read(), filetype="pdf")
        return "\n".join([page.get_text() for page in doc])
    except Exception as e:
        logging.error(f"Error reading PDF: {e}")
        return ""

# --- Create Chroma Vectorstore Safely ---
def create_vectorstore(documents, embeddings):
    """Build a Chroma vector store over *documents* in a fresh temp dir.

    A unique, writable directory is created per call so concurrent
    sessions never collide on the same persisted store.

    NOTE(review): the temp directory is never removed, so long-running
    servers accumulate one directory per question — confirm whether
    cleanup is needed for this deployment.
    """
    persist_path = tempfile.mkdtemp()
    return Chroma.from_documents(
        documents,
        embedding=embeddings,
        persist_directory=persist_path,
    )

# --- Build RAG QA Chain ---
def build_qa_chain(retriever, llm):
    """Assemble a RetrievalQA chain with a grounding prompt.

    Args:
        retriever: LangChain retriever that supplies context passages.
        llm: Language model used to generate the final answer.

    Returns:
        A ``RetrievalQA`` chain configured with the custom prompt.
    """
    template_text = """
You are a helpful assistant. Use the context below to answer the user's question as accurately and truthfully as possible.

Context:
{context}

Question:
{question}

Helpful Answer:
"""
    qa_prompt = PromptTemplate(
        template=template_text,
        input_variables=["context", "question"],
    )
    return RetrievalQA.from_chain_type(
        llm=llm,
        retriever=retriever,
        chain_type_kwargs={"prompt": qa_prompt},
    )

# --- Process QA ---
def process_question(question, full_text):
    """Answer *question* against *full_text* via a freshly built RAG chain.

    The PDF text is round-tripped through a temporary file because
    ``TextLoader`` reads from disk.

    Args:
        question: The user's natural-language question.
        full_text: Full extracted text of the uploaded PDF.

    Returns:
        The LLM's answer string from the RetrievalQA chain.
    """
    # Fix: the original wrote a fixed "temp_text.txt" into the CWD with
    # the platform default encoding (crash-prone on non-ASCII PDF text,
    # racy between sessions) and never deleted it. Use a unique UTF-8
    # temp file and always clean it up.
    fd, tmp_path = tempfile.mkstemp(suffix=".txt")
    try:
        with os.fdopen(fd, "w", encoding="utf-8") as f:
            f.write(full_text)

        loader = TextLoader(tmp_path, encoding="utf-8")
        docs = loader.load()

        text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=150)
        chunks = text_splitter.split_documents(docs)

        embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
        vectorstore = create_vectorstore(chunks, embeddings)
        retriever = vectorstore.as_retriever()

        llm = load_model()
        qa = build_qa_chain(retriever, llm)
        return qa.run(question)
    finally:
        os.remove(tmp_path)

# --- Sidebar Upload ---
with st.sidebar:
    st.header("πŸ“„ Upload your PDF")
    uploaded_file = st.file_uploader("Upload PDF file", type=["pdf"])

# --- Main Logic ---
if uploaded_file:
    st.success(f"Uploaded: {uploaded_file.name}")
    full_text = extract_text_from_pdf(uploaded_file)

    if full_text:
        with st.expander("πŸ“„ View Extracted PDF Text", expanded=False):
            # Preview only the first 3000 chars to keep the UI responsive.
            st.write(full_text[:3000] + ("..." if len(full_text) > 3000 else ""))

        st.subheader("πŸ’¬ Ask Something")
        user_question = st.text_input("Ask a question about the document")

        if user_question:
            with st.spinner("Analyzing..."):
                try:
                    answer = process_question(user_question, full_text)
                except Exception:
                    # Fix: the original swallowed the exception with no
                    # logging, making failures impossible to diagnose.
                    logging.exception("Failed to process question")
                    st.error("⚠️ Something went wrong. Try re-uploading the PDF.")
                    st.stop()
                st.markdown("### πŸ€– Answer")
                st.write(answer)

        with st.sidebar:
            st.markdown("---")
            st.caption("πŸ’‘ Sample Questions")
            st.markdown("""
            - "Summarize the document"
            - "What is the experience of Pradeep Singh Sengar?"
            - "What are the key points?"
            - "Explain in short"
            """)
    else:
        st.error("❌ Could not extract text. Try a different PDF.")
else:
    st.info("Upload a PDF to get started.")