Spaces:

pradeepsengarr
/

Bot_RAG

Sleeping

App Files Files Community

pradeepsengarr committed 16 days ago

Commit

528bb27

verified ·

1 Parent(s): cb0ff81

Update app.py

Browse files

Files changed (1) hide show

app.py +77 -86

app.py CHANGED Viewed

@@ -1,127 +1,118 @@
 import os
-import shutil
-import tempfile
-import fitz  # PyMuPDF
 import streamlit as st
-import logging
 from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain_community.vectorstores import Chroma
-from langchain_community.embeddings import SentenceTransformerEmbeddings
 from langchain.chains import RetrievalQA
 from langchain_community.llms import HuggingFacePipeline
 from langchain.prompts import PromptTemplate
-from langchain_community.document_loaders import TextLoader
-# --- Streamlit Config ---
-st.set_page_config(page_title="📚 RAG PDF Chatbot", layout="wide")
-st.title("📚 RAG-based PDF Chatbot")
-# --- Logging ---
-logging.basicConfig(level=logging.INFO)
-# --- Load Model ---
 @st.cache_resource
-def load_model():
     checkpoint = "MBZUAI/LaMini-T5-738M"
     tokenizer = AutoTokenizer.from_pretrained(checkpoint)
     model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)
     pipe = pipeline("text2text-generation", model=model, tokenizer=tokenizer, max_length=512)
     return HuggingFacePipeline(pipeline=pipe)
-# --- Extract PDF Text ---
-def extract_text_from_pdf(file):
     try:
-        doc = fitz.open(stream=file.read(), filetype="pdf")
-        return "\n".join([page.get_text() for page in doc])
     except Exception as e:
-        logging.error(f"Error reading PDF: {e}")
         return ""
-# --- Create Chroma Vectorstore Safely ---
-def create_vectorstore(documents, embeddings):
-    temp_dir = tempfile.mkdtemp()  # unique, writable temp dir
-    db = Chroma.from_documents(documents, embedding=embeddings, persist_directory=temp_dir)
-    return db
-# --- Build RAG QA Chain ---
-def build_qa_chain(retriever, llm):
-    prompt_template = PromptTemplate(
         input_variables=["context", "question"],
-        template="""
-You are a helpful assistant. Use the context below to answer the user's question as accurately and truthfully as possible.
-Context:
-{context}
-Question:
-{question}
-Helpful Answer:
-"""
     )
-    return RetrievalQA.from_chain_type(llm=llm, retriever=retriever, chain_type_kwargs={"prompt": prompt_template})
-# --- Process QA ---
-def process_question(question, full_text):
-    # Write PDF text to temp file
-    with open("temp_text.txt", "w") as f:
-        f.write(full_text)
-    loader = TextLoader("temp_text.txt")
-    docs = loader.load()
-    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=150)
-    chunks = text_splitter.split_documents(docs)
-    embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
-    vectorstore = create_vectorstore(chunks, embeddings)
-    retriever = vectorstore.as_retriever()
-    llm = load_model()
-    qa = build_qa_chain(retriever, llm)
-    return qa.run(question)
-# --- Sidebar Upload ---
 with st.sidebar:
-    st.header("📄 Upload your PDF")
-    uploaded_file = st.file_uploader("Upload PDF file", type=["pdf"])
-# --- Main Logic ---
-if uploaded_file:
-    st.success(f"Uploaded: {uploaded_file.name}")
-    full_text = extract_text_from_pdf(uploaded_file)
     if full_text:
-        with st.expander("📄 View Extracted PDF Text", expanded=False):
             st.write(full_text[:3000] + ("..." if len(full_text) > 3000 else ""))
-        st.subheader("💬 Ask Something")
-        user_question = st.text_input("Ask a question about the document")
-        if user_question:
-            with st.spinner("Analyzing..."):
-                try:
-                    answer = process_question(user_question, full_text)
-                except Exception as e:
-                    st.error("⚠️ Something went wrong. Try re-uploading the PDF.")
-                    st.stop()
                 st.markdown("### 🤖 Answer")
                 st.write(answer)
-        with st.sidebar:
-            st.markdown("---")
-            st.caption("💡 Sample Questions")
-            st.markdown("""
-            - "Summarize the document"
-            - "What is the experience of Pradeep Singh Sengar?"
-            - "What are the key points?"
-            - "Explain in short"
-            """)
     else:
-        st.error("❌ Could not extract text. Try a different PDF.")
 else:
-    st.info("Upload a PDF to get started.")

 import os
 import streamlit as st
+import fitz  # PyMuPDF
+import tempfile
+import shutil
 from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
 from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langchain_community.embeddings import HuggingFaceEmbeddings
 from langchain_community.vectorstores import Chroma
 from langchain.chains import RetrievalQA
 from langchain_community.llms import HuggingFacePipeline
 from langchain.prompts import PromptTemplate
+# --- Streamlit Setup ---
+st.set_page_config(page_title="📚 Accurate RAG PDF Chatbot", layout="wide")
+st.title("📚 Accurate RAG-based PDF Chatbot")
+# --- Load LLM (You can swap with Phi-2 or Mistral 7B later) ---
 @st.cache_resource
+def load_llm():
+    """Build the LangChain LLM wrapper around the LaMini-T5 generation pipeline.
+
+    The function is decorated with ``st.cache_resource``, so Streamlit caches
+    the returned object instead of reloading the checkpoint on every rerun.
+
+    Returns:
+        A ``HuggingFacePipeline`` wrapping a ``text2text-generation`` pipeline
+        with ``max_length=512``.
+    """
+    model_id = "MBZUAI/LaMini-T5-738M"
+    t5_tokenizer = AutoTokenizer.from_pretrained(model_id)
+    t5_model = AutoModelForSeq2SeqLM.from_pretrained(model_id)
+    generator = pipeline(
+        "text2text-generation",
+        model=t5_model,
+        tokenizer=t5_tokenizer,
+        max_length=512,
+    )
+    return HuggingFacePipeline(pipeline=generator)
+# --- Load Embeddings ---
+@st.cache_resource
+def load_embeddings():
+    """Return the ``BAAI/bge-base-en-v1.5`` embedding model, cached by Streamlit."""
+    embedding_model_name = "BAAI/bge-base-en-v1.5"
+    embedder = HuggingFaceEmbeddings(model_name=embedding_model_name)
+    return embedder
+# --- PDF Text Extraction ---
+def extract_text_from_pdf(uploaded_file):
+    """Extract the plain text of every page of an uploaded PDF.
+
+    Args:
+        uploaded_file: File-like object whose ``read()`` returns the raw PDF
+            bytes (in this app, the value returned by ``st.file_uploader``).
+
+    Returns:
+        The text of all pages concatenated in page order, with leading and
+        trailing whitespace stripped, or ``""`` if the PDF could not be read
+        (the error is reported in the UI through ``st.error``).
+    """
     try:
+        # Open the document as a context manager so the PyMuPDF handle is
+        # closed even if extracting a page raises.
+        with fitz.open(stream=uploaded_file.read(), filetype="pdf") as doc:
+            # join() avoids rebuilding the accumulated string once per page.
+            return "".join(page.get_text() for page in doc).strip()
     except Exception as e:
+        st.error(f"❌ Error reading PDF: {e}")
         return ""
+# --- Text Chunking ---
+def chunk_text(full_text):
+    """Split the document text into overlapping chunks.
+
+    Args:
+        full_text: The complete extracted text of the PDF.
+
+    Returns:
+        The documents produced by ``RecursiveCharacterTextSplitter`` with
+        ``chunk_size=1000`` and ``chunk_overlap=300``.
+    """
+    chunker = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=300)
+    documents = chunker.create_documents([full_text])
+    return documents
+# --- Vectorstore Setup (unique on-disk temp directory per index) ---
+def build_vectorstore(chunks, embeddings):
+    """Embed ``chunks`` and index them in a fresh Chroma vector store.
+
+    Args:
+        chunks: LangChain documents to index.
+        embeddings: Embedding model handed to Chroma.
+
+    Returns:
+        The populated ``Chroma`` vector store, persisted in its own newly
+        created directory under the system temp dir.
+    """
+    # Use a unique directory per call. A single fixed path that is deleted
+    # and recreated on every call lets one call remove the files backing an
+    # index that another session (or an earlier, still-open Chroma client)
+    # is using.
+    # NOTE: the directory is not removed here; it stays in the system temp
+    # dir until cleaned up externally.
+    index_dir = tempfile.mkdtemp(prefix="chromadb-rag-")
+    return Chroma.from_documents(documents=chunks, embedding=embeddings, persist_directory=index_dir)
+# --- Prompt Template ---
+def get_prompt_template():
+    """Return the QA prompt used by the retrieval chain.
+
+    The prompt takes the ``context`` and ``question`` variables and instructs
+    the model to answer only from the supplied context.
+    """
+    template_text = (
+        "You are a helpful assistant. Answer the question based only on the following context.\n\n"
+        "Context:\n{context}\n\n"
+        "Question: {question}\n\n"
+        "Answer (Be accurate and concise):"
+    )
+    prompt = PromptTemplate(
+        template=template_text,
+        input_variables=["context", "question"],
+    )
+    return prompt
+# --- Answering Logic ---
+def get_answer(question, full_text):
+    """Answer ``question`` about ``full_text`` with a retrieval QA chain.
+
+    Args:
+        question: The user's question.
+        full_text: The extracted text of the uploaded PDF.
+
+    Returns:
+        The chain's answer, or a warning message when either argument is
+        empty.
+    """
+    if not (question and full_text):
+        return "⚠️ Please provide both PDF and a question."
+
+    # Indexing: chunk the text, embed the chunks, and store them in Chroma.
+    # This is redone on every call.
+    store = build_vectorstore(chunk_text(full_text), load_embeddings())
+    doc_retriever = store.as_retriever()
+
+    # Answering: "stuff" the retrieved chunks into the prompt for the LLM.
+    language_model = load_llm()
+    rag_chain = RetrievalQA.from_chain_type(
+        llm=language_model,
+        retriever=doc_retriever,
+        chain_type="stuff",
+        chain_type_kwargs={"prompt": get_prompt_template()},
+    )
+    return rag_chain.run(question)
+# --- UI ---
 with st.sidebar:
+    st.header("📄 Upload PDF")
+    uploaded_pdf = st.file_uploader("Upload your PDF", type=["pdf"])
+
+# Main flow: prompt for an upload, then extract the text, show a preview,
+# and answer the user's question about the document.
+if not uploaded_pdf:
+    st.info("📥 Upload a PDF to start.")
+else:
+    st.success(f"✅ Uploaded: {uploaded_pdf.name}")
+    full_text = extract_text_from_pdf(uploaded_pdf)
+    if not full_text:
+        st.error("⚠️ Could not extract any text from the PDF.")
+    else:
+        # Only the first 3000 characters are previewed; an ellipsis marks
+        # a truncated preview.
+        preview = full_text[:3000]
+        if len(full_text) > 3000:
+            preview += "..."
+        with st.expander("📄 Preview PDF Text", expanded=False):
+            st.write(preview)
+        question = st.text_input("❓ Ask a question about this PDF")
+        if question:
+            with st.spinner("💭 Generating answer..."):
+                answer = get_answer(question, full_text)
                 st.markdown("### 🤖 Answer")
                 st.write(answer)
+with st.sidebar:
+    # Sample questions, shown in the sidebar whether or not a PDF is loaded.
+    st.markdown("---")
+    st.markdown("💡 Try questions like:")
+    # st.caption renders Markdown, where a single "\n" does not start a new
+    # line, so the "•"-separated text ran together. Markdown list items give
+    # each sample question its own line.
+    st.caption(
+        "- What are the key ideas?\n"
+        "- Summarize the document\n"
+        "- What is Pradeep Singh Sengar's experience?"
+    )