Spaces:

pradeepsengarr
/

Bot_RAG

Sleeping

App Files Files Community

pradeepsengarr committed on 15 days ago

Commit

0b64652

verified ·

1 Parent(s): 528bb27

Update app.py

Browse files

Files changed (1) hide show

app.py +81 -73

app.py CHANGED Viewed

@@ -1,118 +1,126 @@
 import os
 import streamlit as st
 import fitz  # PyMuPDF
-import tempfile
-import shutil
 from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
 from langchain.text_splitter import RecursiveCharacterTextSplitter
-from langchain_community.embeddings import HuggingFaceEmbeddings
 from langchain_community.vectorstores import Chroma
-from langchain.chains import RetrievalQA
 from langchain_community.llms import HuggingFacePipeline
 from langchain.prompts import PromptTemplate
-# --- Streamlit Setup ---
-st.set_page_config(page_title="📚 Accurate RAG PDF Chatbot", layout="wide")
-st.title("📚 Accurate RAG-based PDF Chatbot")
-# --- Load LLM (You can swap with Phi-2 or Mistral 7B later) ---
 @st.cache_resource
-def load_llm():
     checkpoint = "MBZUAI/LaMini-T5-738M"
     tokenizer = AutoTokenizer.from_pretrained(checkpoint)
     model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)
-    pipe = pipeline("text2text-generation", model=model, tokenizer=tokenizer, max_length=512)
     return HuggingFacePipeline(pipeline=pipe)
-# --- Load Embeddings ---
-@st.cache_resource
-def load_embeddings():
-    return HuggingFaceEmbeddings(model_name="BAAI/bge-base-en-v1.5")
-# --- PDF Text Extraction ---
-def extract_text_from_pdf(uploaded_file):
     try:
-        doc = fitz.open(stream=uploaded_file.read(), filetype="pdf")
-        full_text = ""
         for page in doc:
-            full_text += page.get_text()
-        return full_text.strip()
     except Exception as e:
-        st.error(f"❌ Error reading PDF: {e}")
         return ""
-# --- Text Chunking ---
-def chunk_text(full_text):
-    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=300)
-    return splitter.create_documents([full_text])
-# --- Vectorstore Setup (with in-memory temp directory) ---
-def build_vectorstore(chunks, embeddings):
-    temp_dir = os.path.join(tempfile.gettempdir(), "chromadb-rag")
-    if os.path.exists(temp_dir):
-        shutil.rmtree(temp_dir)
-    os.makedirs(temp_dir, exist_ok=True)
-    return Chroma.from_documents(documents=chunks, embedding=embeddings, persist_directory=temp_dir)
-# --- Prompt Template ---
-def get_prompt_template():
-    return PromptTemplate(
-        input_variables=["context", "question"],
-        template=(
-            "You are a helpful assistant. Answer the question based only on the following context.\n\n"
-            "Context:\n{context}\n\n"
-            "Question: {question}\n\n"
-            "Answer (Be accurate and concise):"
-        )
-    )
-# --- Answering Logic ---
-def get_answer(question, full_text):
-    if not question or not full_text:
-        return "⚠️ Please provide both PDF and a question."
-    chunks = chunk_text(full_text)
-    embeddings = load_embeddings()
-    vectorstore = build_vectorstore(chunks, embeddings)
-    retriever = vectorstore.as_retriever()
-    llm = load_llm()
     qa_chain = RetrievalQA.from_chain_type(
         llm=llm,
         retriever=retriever,
         chain_type="stuff",
-        chain_type_kwargs={"prompt": get_prompt_template()}
     )
     return qa_chain.run(question)
-# --- UI ---
 with st.sidebar:
     st.header("📄 Upload PDF")
-    uploaded_pdf = st.file_uploader("Upload your PDF", type=["pdf"])
-if uploaded_pdf:
-    st.success(f"✅ Uploaded: {uploaded_pdf.name}")
-    full_text = extract_text_from_pdf(uploaded_pdf)
     if full_text:
-        with st.expander("📄 Preview PDF Text", expanded=False):
             st.write(full_text[:3000] + ("..." if len(full_text) > 3000 else ""))
-        question = st.text_input("❓ Ask a question about this PDF")
-        if question:
-            with st.spinner("💭 Generating answer..."):
-                answer = get_answer(question, full_text)
                 st.markdown("### 🤖 Answer")
                 st.write(answer)
     else:
-        st.error("⚠️ Could not extract any text from the PDF.")
 else:
-    st.info("📥 Upload a PDF to start.")
-with st.sidebar:
-    st.markdown("---")
-    st.markdown("💡 Try questions like:")
-    st.caption("• What are the key ideas?\n• Summarize the document\n• What is Pradeep Singh Sengar's experience?")

 import os
 import streamlit as st
 import fitz  # PyMuPDF
+import logging
 from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain_community.vectorstores import Chroma
+from langchain_community.embeddings import SentenceTransformerEmbeddings
 from langchain_community.llms import HuggingFacePipeline
+from langchain.chains import RetrievalQA
 from langchain.prompts import PromptTemplate
+from langchain_community.document_loaders import TextLoader
+# --- Configuration ---
+# Page-level Streamlit settings. Streamlit documents set_page_config as
+# needing to run before any other Streamlit command on the page, so keep it
+# ahead of st.title and the widgets further down.
+st.set_page_config(page_title="📚 RAG PDF Chatbot", layout="wide")
+st.title("📚 RAG-based PDF Chatbot")
+# NOTE(review): `device` is assigned here but not read anywhere in the visible
+# file; the transformers pipeline is built without a device argument.
+device = "cpu"
+# --- Logging ---
+# Root logger at INFO; read_pdf reports extraction failures through it.
+logging.basicConfig(level=logging.INFO)
+# --- Load LLM ---
 @st.cache_resource
+def load_model():
+    """Build the LaMini-T5 text2text pipeline and wrap it for LangChain.
+
+    Decorated with ``st.cache_resource`` so Streamlit can reuse the loaded
+    checkpoint instead of rebuilding it on every script rerun.
+
+    Returns:
+        A ``HuggingFacePipeline`` around a sampling ``text2text-generation``
+        pipeline.
+    """
+    model_id = "MBZUAI/LaMini-T5-738M"
+    # Generation settings kept together so they are easy to audit and tune.
+    generation_kwargs = {
+        "max_length": 1024,
+        "do_sample": True,
+        "temperature": 0.3,
+        "top_k": 50,
+        "top_p": 0.95,
+    }
+    seq2seq_tokenizer = AutoTokenizer.from_pretrained(model_id)
+    seq2seq_model = AutoModelForSeq2SeqLM.from_pretrained(model_id)
+    generator = pipeline(
+        'text2text-generation',
+        model=seq2seq_model,
+        tokenizer=seq2seq_tokenizer,
+        **generation_kwargs,
+    )
+    return HuggingFacePipeline(pipeline=generator)
+# --- Extract PDF Text ---
+def read_pdf(file):
+    """Extract the plain text of every page of an uploaded PDF.
+
+    Args:
+        file: Binary file-like object exposing ``read()`` that yields the raw
+            PDF bytes (here, the Streamlit upload).
+
+    Returns:
+        The concatenated page text with surrounding whitespace stripped, or
+        ``""`` when the PDF cannot be opened or parsed.
+    """
     try:
+        # Open as a context manager so the PyMuPDF document is closed even
+        # when a page fails to parse; previously it was never closed.
+        with fitz.open(stream=file.read(), filetype="pdf") as doc:
+            # join() avoids the quadratic cost of repeated string +=.
+            text = "".join(page.get_text() for page in doc)
+        return text.strip()
     except Exception as e:
+        # Deliberately best-effort: the caller treats "" as "nothing
+        # extracted". logging.exception keeps the traceback for debugging.
+        logging.exception("Failed to extract text: %s", e)
         return ""
+# --- Process Answer ---
+@st.cache_resource
+def _load_embeddings():
+    """Return the sentence-transformer embedding model, cached by Streamlit."""
+    return SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
+
+def process_answer(question, full_text):
+    """Answer ``question`` with retrieval-augmented generation over ``full_text``.
+
+    The text is chunked, embedded into a throwaway Chroma collection, and
+    queried through a "stuff" RetrievalQA chain with a custom prompt.
+
+    Args:
+        question: The user's question.
+        full_text: Text extracted from the uploaded PDF.
+
+    Returns:
+        The answer produced by the RetrievalQA chain, or a warning message
+        when the question or the text is blank.
+    """
+    import uuid
+
+    # Fail fast on blank input instead of erroring inside the vector store.
+    if not (question and question.strip() and full_text and full_text.strip()):
+        return "⚠️ Please provide both PDF and a question."
+    # Chunk the text directly in memory. The earlier round trip through a fixed
+    # "temp_text.txt" in the working directory was shared by every session,
+    # depended on the platform default encoding, and was never cleaned up.
+    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=300)
+    splits = text_splitter.create_documents([full_text])
+    # Set up the model
+    llm = load_model()
+    # Create a custom prompt
+    prompt_template = PromptTemplate.from_template("""
+    You are a helpful assistant. Use the following context to answer the question as accurately and thoroughly as possible.
+    Context: {context}
+    Question: {question}
+    Answer in detail:""")
+    # Use a unique collection per call.
+    # NOTE(review): with the default collection name, repeated in-process
+    # calls are reported to add to the same in-memory collection, mixing
+    # chunks from earlier questions/PDFs - confirm for the pinned versions.
+    db = Chroma.from_documents(
+        splits,
+        embedding=_load_embeddings(),
+        collection_name=f"pdf-{uuid.uuid4().hex}",
+    )
+    try:
+        # Retrieval QA with custom prompt
+        qa_chain = RetrievalQA.from_chain_type(
+            llm=llm,
+            retriever=db.as_retriever(),
+            chain_type="stuff",
+            chain_type_kwargs={"prompt": prompt_template}
+        )
+        # Return the answer using the retrieval QA chain
+        return qa_chain.run(question)
+    finally:
+        # Drop the throwaway collection so chunks do not accumulate in memory.
+        db.delete_collection()
+# --- UI Layout ---
+# Sidebar: the PDF upload widget. `uploaded_file` is falsy until a file is chosen.
 with st.sidebar:
     st.header("📄 Upload PDF")
+    uploaded_file = st.file_uploader("Choose a PDF", type=["pdf"])
+# --- Main Interface ---
+# Three outcomes: no upload -> info hint; upload with no extractable text ->
+# error; otherwise show a preview, the question box and the suggestions.
+if uploaded_file:
+    st.success(f"You uploaded: {uploaded_file.name}")
+    # read_pdf returns "" on failure, which routes to the error branch below.
+    full_text = read_pdf(uploaded_file)
     if full_text:
+        st.subheader("📁 PDF Preview")
+        with st.expander("View Extracted Text"):
+            # Preview only the first 3000 characters; "..." marks truncation.
             st.write(full_text[:3000] + ("..." if len(full_text) > 3000 else ""))
+        st.subheader("💬 Ask a Question")
+        user_question = st.text_input("Type your question about the PDF content")
+        if user_question:
+            # The answer is computed and rendered inside the spinner context.
+            with st.spinner("Thinking..."):
+                answer = process_answer(user_question, full_text)
                 st.markdown("### 🤖 Answer")
                 st.write(answer)
+        # Suggestions are shown twice: a short caption in the sidebar and a
+        # fuller list in an expander in the main column.
+        with st.sidebar:
+            st.markdown("---")
+            st.markdown("**💡 Suggestions:**")
+            st.caption("Try: \"Summarize this document\" or \"What is the key idea?\"")
+        with st.expander("💡 Suggestions", expanded=True):
+            st.markdown("""
+            - "Summarize this document"
+            - "Give a quick summary"
+            - "What are the main points?"
+            - "Explain this document in short"
+            """)
     else:
+        st.error("⚠️ No text could be extracted from the PDF. Try another file.")
 else:
+    st.info("Upload a PDF to begin.")