# Bot_RAG / app.py — RAG-based PDF chatbot (Streamlit + LangChain + Chroma)

import atexit
import logging
import shutil
import tempfile

import fitz  # PyMuPDF
import streamlit as st
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import SentenceTransformerEmbeddings
from langchain.chains import RetrievalQA
from langchain_community.llms import HuggingFacePipeline
from langchain.prompts import PromptTemplate
from langchain_community.document_loaders import TextLoader

# --- Streamlit Config ---
st.set_page_config(page_title="📚 RAG PDF Chatbot", layout="wide")
st.title("📚 RAG-based PDF Chatbot")

# --- Logging ---
logging.basicConfig(level=logging.INFO)

# --- Load Model ---
@st.cache_resource
def load_model():
    # LaMini-T5-738M: a small instruction-tuned seq2seq model, cached across reruns.
    checkpoint = "MBZUAI/LaMini-T5-738M"
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)
    pipe = pipeline("text2text-generation", model=model, tokenizer=tokenizer, max_length=512)
    return HuggingFacePipeline(pipeline=pipe)
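
# Hedged note: max_length=512 above caps the length of the generated answer,
# not the size of the retrieval context. On a GPU-enabled host one could also
# pass device=0 to pipeline(...); as written, the model runs on CPU.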

# --- Extract PDF Text ---
def extract_text_from_pdf(file):
    try:
        # Read the uploaded file from memory; the context manager closes the doc.
        with fitz.open(stream=file.read(), filetype="pdf") as doc:
            return "\n".join(page.get_text() for page in doc)
    except Exception as e:
        logging.error(f"Error reading PDF: {e}")
        return ""

# --- Create Chroma Vectorstore Safely ---
def create_vectorstore(documents, embeddings):
    temp_dir = tempfile.mkdtemp()  # unique, writable temp dir
    # Best-effort cleanup at process exit, so throwaway indexes don't pile up.
    atexit.register(shutil.rmtree, temp_dir, ignore_errors=True)
    db = Chroma.from_documents(documents, embedding=embeddings, persist_directory=temp_dir)
    return db
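
# Hedged note: older chromadb releases (<0.4) needed an explicit db.persist()
# to flush the index to disk; newer versions persist automatically. Either way
# the stores built above are throwaway, so no extra call is made here.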

# --- Build RAG QA Chain ---
def build_qa_chain(retriever, llm):
    prompt_template = PromptTemplate(
        input_variables=["context", "question"],
        template="""
You are a helpful assistant. Use the context below to answer the user's question as accurately and truthfully as possible.

Context:
{context}

Question:
{question}

Helpful Answer:
""",
    )
    return RetrievalQA.from_chain_type(llm=llm, retriever=retriever, chain_type_kwargs={"prompt": prompt_template})
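
# Hedged note: from_chain_type defaults to the "stuff" chain type, which packs
# all retrieved chunks into a single prompt. Passing return_source_documents=True
# here would let the UI show which chunks each answer came from.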

# --- Process QA ---
def process_question(question, full_text):
    # Write the extracted PDF text to a temp file so TextLoader can ingest it.
    with tempfile.NamedTemporaryFile("w", suffix=".txt", delete=False, encoding="utf-8") as f:
        f.write(full_text)
        text_path = f.name
    loader = TextLoader(text_path, encoding="utf-8")
    docs = loader.load()
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=150)
    chunks = text_splitter.split_documents(docs)
    embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
    vectorstore = create_vectorstore(chunks, embeddings)
    retriever = vectorstore.as_retriever()
    llm = load_model()
    qa = build_qa_chain(retriever, llm)
    return qa.run(question)
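
# Performance sketch (an assumption, not part of the original flow): the whole
# embed-and-index step above reruns for every question. Caching the retriever
# on the extracted text would reuse it across questions about the same PDF.
# The helper name get_retriever is hypothetical.
@st.cache_resource(show_spinner=False)
def get_retriever(full_text: str):
    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=150)
    docs = splitter.create_documents([full_text])
    embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
    return create_vectorstore(docs, embeddings).as_retriever()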

# --- Sidebar Upload ---
with st.sidebar:
    st.header("📄 Upload your PDF")
    uploaded_file = st.file_uploader("Upload PDF file", type=["pdf"])

# --- Main Logic ---
if uploaded_file:
    st.success(f"Uploaded: {uploaded_file.name}")
    full_text = extract_text_from_pdf(uploaded_file)

    if full_text:
        with st.expander("📄 View Extracted PDF Text", expanded=False):
            st.write(full_text[:3000] + ("..." if len(full_text) > 3000 else ""))

        st.subheader("💬 Ask Something")
        user_question = st.text_input("Ask a question about the document")

        if user_question:
            with st.spinner("Analyzing..."):
                try:
                    answer = process_question(user_question, full_text)
                except Exception as e:
                    logging.error(f"QA pipeline failed: {e}")
                    st.error("⚠️ Something went wrong. Try re-uploading the PDF.")
                    st.stop()
            st.markdown("### 🤖 Answer")
            st.write(answer)

        with st.sidebar:
            st.markdown("---")
            st.caption("💡 Sample Questions")
            st.markdown("""
            - "Summarize the document"
            - "What is the experience of Pradeep Singh Sengar?"
            - "What are the key points?"
            - "Explain in short"
            """)
    else:
        st.error("❌ Could not extract text. Try a different PDF.")
else:
    st.info("Upload a PDF to get started.")