import os import streamlit as st import fitz # PyMuPDF import logging from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline from langchain.text_splitter import RecursiveCharacterTextSplitter from langchain_community.vectorstores import Chroma from langchain_community.embeddings import SentenceTransformerEmbeddings from langchain_community.llms import HuggingFacePipeline from langchain.chains import RetrievalQA from langchain.prompts import PromptTemplate from langchain_community.document_loaders import TextLoader # --- Configuration --- st.set_page_config(page_title="📚 RAG PDF Chatbot", layout="wide") st.title("📚 RAG-based PDF Chatbot") device = "cpu" # --- Logging --- logging.basicConfig(level=logging.INFO) # --- Load LLM --- @st.cache_resource def load_model(): checkpoint = "MBZUAI/LaMini-T5-738M" tokenizer = AutoTokenizer.from_pretrained(checkpoint) model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint) pipe = pipeline('text2text-generation', model=model, tokenizer=tokenizer, max_length=1024, do_sample=True, temperature=0.3, top_k=50, top_p=0.95) return HuggingFacePipeline(pipeline=pipe) # --- Extract PDF Text --- def read_pdf(file): try: doc = fitz.open(stream=file.read(), filetype="pdf") text = "" for page in doc: text += page.get_text() return text.strip() except Exception as e: logging.error(f"Failed to extract text: {e}") return "" # --- Process Answer --- def process_answer(question, full_text): # Save the full_text to a temporary file with open("temp_text.txt", "w") as f: f.write(full_text) loader = TextLoader("temp_text.txt") docs = loader.load() # Chunk the documents with increased size and overlap text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=300) splits = text_splitter.split_documents(docs) # Load embeddings embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2") # Create Chroma in-memory vector store db = Chroma.from_documents(splits, embedding=embeddings) retriever = db.as_retriever() # Set up the model llm = load_model() # Create a custom prompt prompt_template = PromptTemplate.from_template(""" You are a helpful assistant. Use the following context to answer the question as accurately and thoroughly as possible. Context: {context} Question: {question} Answer in detail:""") # Retrieval QA with custom prompt qa_chain = RetrievalQA.from_chain_type( llm=llm, retriever=retriever, chain_type="stuff", chain_type_kwargs={"prompt": prompt_template} ) # Return the answer using the retrieval QA chain return qa_chain.run(question) # --- UI Layout --- with st.sidebar: st.header("📄 Upload PDF") uploaded_file = st.file_uploader("Choose a PDF", type=["pdf"]) # --- Main Interface --- if uploaded_file: st.success(f"You uploaded: {uploaded_file.name}") full_text = read_pdf(uploaded_file) if full_text: st.subheader("📁 PDF Preview") with st.expander("View Extracted Text"): st.write(full_text[:3000] + ("..." if len(full_text) > 3000 else "")) st.subheader("💬 Ask a Question") user_question = st.text_input("Type your question about the PDF content") if user_question: with st.spinner("Thinking..."): answer = process_answer(user_question, full_text) st.markdown("### 🤖 Answer") st.write(answer) with st.sidebar: st.markdown("---") st.markdown("**💡 Suggestions:**") st.caption("Try: \"Summarize this document\" or \"What is the key idea?\"") with st.expander("💡 Suggestions", expanded=True): st.markdown(""" - "Summarize this document" - "Give a quick summary" - "What are the main points?" - "Explain this document in short" """) else: st.error("⚠️ No text could be extracted from the PDF. Try another file.") else: st.info("Upload a PDF to begin.")