import logging
import tempfile

import fitz  # PyMuPDF
import streamlit as st
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import SentenceTransformerEmbeddings
from langchain_community.llms import HuggingFacePipeline
from langchain_community.document_loaders import TextLoader

# --- Streamlit Config ---
st.set_page_config(page_title="📚 RAG PDF Chatbot", layout="wide")
st.title("📚 RAG-based PDF Chatbot")

# --- Logging ---
logging.basicConfig(level=logging.INFO)

# --- Load Model ---
@st.cache_resource
def load_model():
    # Load the LaMini-T5 model once and cache it across Streamlit reruns
    checkpoint = "MBZUAI/LaMini-T5-738M"
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)
    pipe = pipeline("text2text-generation", model=model, tokenizer=tokenizer, max_length=512)
    return HuggingFacePipeline(pipeline=pipe)

# --- Extract PDF Text ---
def extract_text_from_pdf(file):
    try:
        with fitz.open(stream=file.read(), filetype="pdf") as doc:
            return "\n".join(page.get_text() for page in doc)
    except Exception as e:
        logging.error(f"Error reading PDF: {e}")
        return ""

# --- Create Chroma Vectorstore Safely ---
def create_vectorstore(documents, embeddings):
    temp_dir = tempfile.mkdtemp()  # unique, writable temp dir
    return Chroma.from_documents(documents, embedding=embeddings, persist_directory=temp_dir)

# --- Build RAG QA Chain ---
def build_qa_chain(retriever, llm):
    prompt_template = PromptTemplate(
        input_variables=["context", "question"],
        template="""You are a helpful assistant. Use the context below to answer the user's question as accurately and truthfully as possible.

Context: {context}

Question: {question}

Helpful Answer:""",
    )
    return RetrievalQA.from_chain_type(
        llm=llm,
        retriever=retriever,
        chain_type_kwargs={"prompt": prompt_template},
    )

# --- Process QA ---
def process_question(question, full_text):
    # Write the extracted PDF text to a temp file so TextLoader can read it
    with open("temp_text.txt", "w", encoding="utf-8") as f:
        f.write(full_text)

    loader = TextLoader("temp_text.txt", encoding="utf-8")
    docs = loader.load()

    # Split into overlapping chunks for retrieval
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=150)
    chunks = text_splitter.split_documents(docs)

    # Embed the chunks and index them in Chroma
    embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
    vectorstore = create_vectorstore(chunks, embeddings)
    retriever = vectorstore.as_retriever()

    llm = load_model()
    qa = build_qa_chain(retriever, llm)
    return qa.run(question)

# --- Sidebar Upload ---
with st.sidebar:
    st.header("📄 Upload your PDF")
    uploaded_file = st.file_uploader("Upload PDF file", type=["pdf"])

# --- Main Logic ---
if uploaded_file:
    st.success(f"Uploaded: {uploaded_file.name}")
    full_text = extract_text_from_pdf(uploaded_file)

    if full_text:
        with st.expander("📄 View Extracted PDF Text", expanded=False):
            st.write(full_text[:3000] + ("..." if len(full_text) > 3000 else ""))

        st.subheader("💬 Ask Something")
        user_question = st.text_input("Ask a question about the document")

        if user_question:
            with st.spinner("Analyzing..."):
                try:
                    answer = process_question(user_question, full_text)
                except Exception as e:
                    logging.error(f"Question answering failed: {e}")
                    st.error("⚠️ Something went wrong. Try re-uploading the PDF.")
                    st.stop()
            st.markdown("### 🤖 Answer")
            st.write(answer)

        with st.sidebar:
            st.markdown("---")
            st.caption("💡 Sample Questions")
            st.markdown("""
            - "Summarize the document"
            - "What is the experience of Pradeep Singh Sengar?"
            - "What are the key points?"
            - "Explain in short"
            """)
    else:
        st.error("❌ Could not extract text. Try a different PDF.")
else:
    st.info("Upload a PDF to get started.")