Update app.py

app.py CHANGED
@@ -420,9 +420,9 @@
 import os
 import streamlit as st
 import fitz  # PyMuPDF
-import logging
 import tempfile
 import shutil
+import logging
 from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain_community.vectorstores import Chroma
@@ -430,132 +430,125 @@ from langchain_community.embeddings import SentenceTransformerEmbeddings
 from langchain_community.llms import HuggingFacePipeline
 from langchain.chains import RetrievalQA
 from langchain_community.document_loaders import TextLoader
+from langchain.docstore.document import Document
 
-# ---
+# --- Streamlit Config ---
 st.set_page_config(page_title="📄 RAG PDF Chatbot", layout="wide")
 st.title("📄 RAG-based PDF Chatbot")
-device = "cpu"
 
 # --- Logging ---
 logging.basicConfig(level=logging.INFO)
 
-# --- Load LLM ---
+# --- Load LLM Model ---
 @st.cache_resource
-def
+def load_llm():
     checkpoint = "MBZUAI/LaMini-T5-738M"
     tokenizer = AutoTokenizer.from_pretrained(checkpoint)
     model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)
     pipe = pipeline("text2text-generation", model=model, tokenizer=tokenizer, max_length=512)
     return HuggingFacePipeline(pipeline=pipe)
 
-# ---
-def
+# --- PDF Text Extraction ---
+def extract_text_from_pdf(file):
     try:
         doc = fitz.open(stream=file.read(), filetype="pdf")
-
-        # Extract text from each page
+        full_text = ""
         for page in doc:
-
-        return
+            full_text += page.get_text()
+        return full_text.strip()
     except Exception as e:
-        logging.error(f"
+        logging.error(f"Error reading PDF: {e}")
        return ""
 
-# --- Build
-
-
-
-
-
-
-    loader = TextLoader("temp_text.txt")
-    docs = loader.load()
-
-    # Chunking
-    text_splitter = RecursiveCharacterTextSplitter(chunk_size=800, chunk_overlap=300)
-    splits = text_splitter.split_documents(docs)
-
-    # Embeddings
-    embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
-
-    # Safe temporary directory for Chroma
-    chroma_dir = os.path.join(tempfile.gettempdir(), "chroma_db_rag")
-    if os.path.exists(chroma_dir):
-        shutil.rmtree(chroma_dir)
-    os.makedirs(chroma_dir, exist_ok=True)
+# --- Build Vectorstore ---
+def create_vectorstore(text_chunks, embeddings):
+    temp_dir = os.path.join(tempfile.gettempdir(), "chroma_db")
+    if os.path.exists(temp_dir):
+        shutil.rmtree(temp_dir)
+    os.makedirs(temp_dir, exist_ok=True)
 
-
+    # Wrap each chunk in a Document object
+    documents = [Document(page_content=chunk) for chunk in text_chunks]
+    db = Chroma.from_documents(documents, embedding=embeddings, persist_directory=temp_dir)
     db.persist()
-    return db
-
-# ---
-def
-
+    return db
+
+# --- Smart Chunking ---
+def chunk_text(full_text):
+    splitter = RecursiveCharacterTextSplitter(
+        chunk_size=1000,
+        chunk_overlap=150,
+        separators=["\n\n", "\n", ".", "!", "?", " ", ""]
+    )
+    return splitter.split_text(full_text)
+
+# --- Answering Logic ---
+def process_question(question, full_text):
+    if not full_text:
+        return "No valid text extracted from PDF."
+
+    text_chunks = chunk_text(full_text)
+    embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
+    vectorstore = create_vectorstore(text_chunks, embeddings)
+    retriever = vectorstore.as_retriever()
 
-
-
-
-
-
+    llm = load_llm()
+    qa = RetrievalQA.from_chain_type(
+        llm=llm,
+        retriever=retriever,
+        chain_type="stuff",
+        return_source_documents=False,
+        chain_type_kwargs={
+            "prompt": f"""You are a helpful assistant. Answer the user's question based only on the provided document content.
 
-
-    # Let's modify how we ask the model to answer
-    prompt = f"""
-    Given the following text, answer the question with a simple and direct 'Yes' or 'No' followed by a brief explanation.
+If the answer is clearly stated in the document, respond accurately and directly.
 
-
-
-    Question: {question}
-    Answer:
-    """
+If not, say "The document does not provide enough information." Do not make things up.
 
-
-
-
+Question: {question}
+Context: {{context}}
+Answer:"""
+        }
+    )
 
-    return
+    return qa.run(question)
 
-# --- UI
+# --- Streamlit UI ---
 with st.sidebar:
     st.header("📁 Upload PDF")
-    uploaded_file = st.file_uploader("Choose a PDF", type=["pdf"])
+    uploaded_file = st.file_uploader("Choose a PDF file", type=["pdf"])
 
-# --- Main Interface ---
 if uploaded_file:
-    st.success(f"
-    full_text =
+    st.success(f"Uploaded: {uploaded_file.name}")
+    full_text = extract_text_from_pdf(uploaded_file)
 
     if full_text:
         st.subheader("📄 PDF Preview")
-        with st.expander("View Extracted Text"):
+        with st.expander("📄 View Extracted Text"):
             st.write(full_text[:3000] + ("..." if len(full_text) > 3000 else ""))
 
-        st.subheader("💬 Ask
-        user_question = st.text_input("
-
-        # Build retriever once per session
-        retriever = build_retriever(full_text)
+        st.subheader("💬 Ask your question")
+        user_question = st.text_input("Enter your question about the PDF")
 
         if user_question:
-            with st.spinner("
-            answer =
+            with st.spinner("🤖 Generating Answer..."):
+                answer = process_question(user_question, full_text)
             st.markdown("### 🤖 Answer")
             st.write(answer)
 
         with st.sidebar:
            st.markdown("---")
            st.markdown("**💡 Suggestions:**")
-            st.caption("Try: \"Summarize this document\" or \"What is the key idea?\"")
-            with st.expander("💡 Suggestions", expanded=True):
+            st.caption("Try: \"Summarize this document\" or \"What is the key idea?\"")
             st.markdown("""
            - "Summarize this document"
-            - "
-            - "What
-            - "
+            - "What is the background of Pradeep Singh Sengar?"
+            - "What experience does he have?"
+            - "List key skills mentioned in the document."
            """)
-
     else:
-        st.error("
+        st.error("❌ No extractable text found in this PDF. Try another file.")
 else:
     st.info("Upload a PDF to begin.")
 
+
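A few notes on the change. The rewritten extraction helper opens the upload through PyMuPDF's stream mode, which is the right call here: Streamlit's file_uploader hands back an in-memory buffer rather than a filesystem path. A minimal sketch of the same extraction path, runnable outside Streamlit ("sample.pdf" is a placeholder name, not a file from this Space):

    import fitz  # PyMuPDF

    # Same logic as extract_text_from_pdf(), fed from disk for a quick test.
    with open("sample.pdf", "rb") as f:
        doc = fitz.open(stream=f.read(), filetype="pdf")

    full_text = ""
    for page in doc:
        full_text += page.get_text()  # plain-text layer of each page

    print(f"{doc.page_count} pages, {len(full_text)} characters extracted")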
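The chunking change does two things: it drops the old temp-file round trip (write temp_text.txt, reload it with TextLoader, split_documents) in favor of an in-memory split_text call, and it retunes the splitter from chunk_size=800 / overlap=300 to 1000 / 150 with an explicit separator ladder, so splits prefer paragraph breaks, then line breaks, then sentence punctuation, before falling back to spaces. One side effect: the TextLoader import is now unused. The behavior is easy to inspect in isolation (the toy text below is purely illustrative):

    from langchain.text_splitter import RecursiveCharacterTextSplitter

    splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=150,
        separators=["\n\n", "\n", ".", "!", "?", " ", ""],
    )

    # Two long paragraphs: the first split should land on the paragraph break.
    text = ("Alpha sentence. " * 60) + "\n\n" + ("Beta sentence. " * 60)
    for i, chunk in enumerate(splitter.split_text(text)):
        print(i, len(chunk), repr(chunk[:40]))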
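One thing worth flagging in process_question: RetrievalQA's "stuff" chain expects chain_type_kwargs["prompt"] to be a PromptTemplate, not a bare string, and the f-string here bakes {question} in at build time while leaving {{context}} as a literal the chain never fills. Assuming the intent was for the chain to inject both variables at query time, the conventional wiring looks like this (same wording as the commit's prompt):

    from langchain.prompts import PromptTemplate

    template = """You are a helpful assistant. Answer the user's question based only on the provided document content.

    If the answer is clearly stated in the document, respond accurately and directly.

    If not, say "The document does not provide enough information." Do not make things up.

    Question: {question}
    Context: {context}
    Answer:"""

    qa_prompt = PromptTemplate(template=template, input_variables=["context", "question"])

    # Then, inside process_question:
    # qa = RetrievalQA.from_chain_type(..., chain_type_kwargs={"prompt": qa_prompt})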
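Also worth noting: process_question re-chunks the text, re-embeds every chunk, and wipes and rebuilds the Chroma index on every question, whereas the removed "# Build retriever once per session" comment suggests the previous version built the retriever once. If latency matters, a cached variant along these lines would restore that behavior; build_retriever_cached is a hypothetical helper, not part of this commit:

    import streamlit as st

    @st.cache_resource(show_spinner=False)
    def build_retriever_cached(full_text: str):
        # Hypothetical: reuses the app's own chunk_text / create_vectorstore so
        # embedding and indexing happen once per document, not once per question.
        chunks = chunk_text(full_text)
        embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
        return create_vectorstore(chunks, embeddings).as_retriever()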
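Finally, @st.cache_resource on load_llm means the 738M-parameter LaMini-T5 checkpoint is downloaded and instantiated once per server process and reused across script reruns. The wrapper can be smoke-tested on its own, independent of retrieval (the .invoke call assumes a recent LangChain):

    from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
    from langchain_community.llms import HuggingFacePipeline

    # Same construction as load_llm(), outside Streamlit.
    checkpoint = "MBZUAI/LaMini-T5-738M"
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)
    pipe = pipeline("text2text-generation", model=model, tokenizer=tokenizer, max_length=512)
    llm = HuggingFacePipeline(pipeline=pipe)

    print(llm.invoke("What is retrieval-augmented generation?"))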