Update app.py
app.py CHANGED
@@ -419,261 +419,118 @@
import os
-import logging
-import math
import streamlit as st
import fitz  # PyMuPDF
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
-from langchain_community.document_loaders import PDFMinerLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
-from langchain_community.embeddings import SentenceTransformerEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_community.llms import HuggingFacePipeline
from langchain.chains import RetrievalQA

-#
-# Define global variables
-device = 'cpu'
persist_directory = "db"

-# Streamlit app configuration
-st.set_page_config(page_title="RAG-based Chatbot", layout="wide")
-st.title("RAG-based Chatbot")

-# Load the model
-checkpoint = "MBZUAI/LaMini-T5-738M"
-tokenizer = AutoTokenizer.from_pretrained(checkpoint)
-base_model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

-#

    try:
-        doc = fitz.open(
        text = ""
-        for
-        return text
    except Exception as e:
-        logging.error(f"
-        return

-def data_ingestion():
-    """Function to load PDFs and create embeddings with improved error handling and efficiency."""
-    try:
-        logging.info("Starting data ingestion")
-
-        if not os.path.exists(uploaded_files_dir):
-            os.makedirs(uploaded_files_dir)
-
-        documents = []
-        for filename in os.listdir(uploaded_files_dir):
-            if filename.endswith(".pdf"):
-                file_path = os.path.join(uploaded_files_dir, filename)
-                logging.info(f"Processing file: {file_path}")
-
-                try:
-                    loader = PDFMinerLoader(file_path)
-                    loaded_docs = loader.load()
-                    if not loaded_docs:
-                        logging.warning(f"Skipping file with missing or invalid metadata: {file_path}")
-                        continue
-
-                    for doc in loaded_docs:
-                        if hasattr(doc, 'page_content') and len(doc.page_content.strip()) > 0:
-                            documents.append(doc)
-                        else:
-                            logging.warning(f"Skipping invalid document structure in {file_path}")
-                except ValueError as e:
-                    logging.error(f"Skipping {file_path}: {str(e)}")
-                    continue
-
-        if not documents:
-            logging.error("No valid documents found to process.")
-            return
-
-        logging.info(f"Total valid documents: {len(documents)}")
-
-        # Proceed with splitting and embedding documents
-        text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
-        texts = text_splitter.split_documents(documents)
-
-        logging.info(f"Total text chunks created: {len(texts)}")
-
-        if not texts:
-            logging.error("No valid text chunks to create embeddings.")
-            return
-
-        embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
-
-        # Proceed to split and embed the documents
-        MAX_BATCH_SIZE = 5461
-        total_batches = math.ceil(len(texts) / MAX_BATCH_SIZE)
-
-        logging.info(f"Processing {len(texts)} text chunks in {total_batches} batches...")

-            text_batch = texts[batch_start:batch_end]
-
-            logging.info(f"Processing batch {i + 1}/{total_batches}, size: {len(text_batch)}")

-            else:
-                db.add_documents(text_batch)
-
-        db.persist()
-        logging.info("Data ingestion completed successfully")
-
-    except Exception as e:
-        logging.error(f"Error during data ingestion: {str(e)}")
-        raise
-
-def llm_pipeline():
-    """Set up the language model pipeline."""
-    logging.info("Setting up LLM pipeline")
-    pipe = pipeline(
-        'text2text-generation',
-        model=base_model,
-        tokenizer=tokenizer,
-        max_length=256,
-        do_sample=True,
-        temperature=0.3,
-        top_p=0.95,
-        device=device
-    )
-    local_llm = HuggingFacePipeline(pipeline=pipe)
-    logging.info("LLM pipeline setup complete")
-    return local_llm
-
-def qa_llm():
-    """Set up the question-answering chain."""
-    logging.info("Setting up QA model")
-    llm = llm_pipeline()
    embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
-    db = Chroma(persist_directory=persist_directory
-        llm=llm,
-        chain_type="stuff",
-        retriever=retriever,
-        return_source_documents=True
-    )
-    logging.info("QA model setup complete")
-    return qa

-    db = Chroma(persist_directory=persist_directory, embedding_function=embeddings)
-    retriever = db.as_retriever()  # Set up the retriever to use Chroma database
-
-    # Here we're just adding the full_text as a document for simplicity
-    db.add_documents([full_text])
-
-    # Set up the language model pipeline (assuming you already have a pipeline set up)
-    llm = llm_pipeline()
-
-    # Construct the retrieval chain using the retriever and LLM
-    qa_chain = RetrievalQA.from_chain_type(
-        llm=llm,
-        chain_type="stuff",
-        retriever=retriever,
-        return_source_documents=True
-    )
-
-    # Create a tailored prompt for the question (providing context to the chatbot)
-    tailored_prompt = f"""
-    You are a helpful RAG-based chatbot designed to assist with answering questions from any uploaded document.
-    You should answer the question using relevant information from the provided PDF text.
-    Please provide a clear, informative answer based on the document content.
-    User question: {user_question}
-    """
-
-    # Generate the answer using the retrieval-augmented generation model
-    generated_text = qa_chain({"query": tailored_prompt})
-
-    # Extract the generated answer
-    answer = generated_text['result']

    except Exception as e:
-        logging.error(f"Error
-        return "Sorry, I

-st.

-with

-        # Extract and display the full text from the PDF
-        st.subheader("Full Text from the PDF:")
-        full_text = extract_text_from_pdf(file_path)
-        if full_text:
-            st.text_area("PDF Text", full_text, height=300)
-        else:
-            st.warning("Failed to extract text from this PDF.")
-
-        # # Generate summary option
-        # if st.button("Generate Summary of Document"):
-        #     st.write("Summary: [Provide the generated summary here]")
-
-        # Run data ingestion when files are uploaded
-        data_ingestion()
-
-        # Display UI for Q&A
-        st.header("Ask a Question")
-        user_question = st.text_input("Enter your question here:")
-
-        if user_question:
-            answer = process_answer(user_question)
-            st.write(answer)
-
else:
-    st.
import os
import streamlit as st
import fitz  # PyMuPDF
+import logging
+import math
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
+from langchain_community.embeddings import SentenceTransformerEmbeddings
from langchain_community.llms import HuggingFacePipeline
from langchain.chains import RetrievalQA
+from langchain.schema import Document

+# --- Configuration ---
+st.set_page_config(page_title="RAG PDF Chatbot", layout="wide")
+st.title("RAG-based PDF Chatbot")
persist_directory = "db"
+device = "cpu"

+# --- Logging ---
+logging.basicConfig(level=logging.INFO)

+# --- Load LLM ---
+@st.cache_resource
+def load_model():
+    checkpoint = "MBZUAI/LaMini-T5-738M"
+    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
+    model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)
+    pipe = pipeline('text2text-generation', model=model, tokenizer=tokenizer, max_length=512)
+    return HuggingFacePipeline(pipeline=pipe)
+
+# --- Extract PDF Text ---
+def read_pdf(file):
    try:
+        doc = fitz.open(stream=file.read(), filetype="pdf")
        text = ""
+        for page in doc:
+            text += page.get_text()
+        return text.strip()
    except Exception as e:
+        logging.error(f"Failed to extract text: {e}")
+        return ""

+# --- Split Text into Chunks ---
+def split_text_into_chunks(text):
+    splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
+    return splitter.create_documents([text])

+# --- Create Vector DB ---
+def create_vectorstore(documents):
    embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
+    db = Chroma.from_documents(documents, embeddings, persist_directory=persist_directory)
+    db.persist()
+    return db

+# --- Setup QA Chain ---
+def setup_qa(db):
+    retriever = db.as_retriever()
+    llm = load_model()
+    return RetrievalQA.from_chain_type(llm=llm, retriever=retriever, return_source_documents=True)

+# --- Process Answer ---
+def process_answer(user_question, full_text):
+    if not full_text:
+        return "No content was extracted from the PDF. Please try another file."

+    docs = split_text_into_chunks(full_text)
+    db = create_vectorstore(docs)
+    qa = setup_qa(db)

+    prompt = f"""
+    You are a helpful AI assistant. Based on the provided context from a PDF document,
+    generate an accurate, informative answer to the following question:

+    {user_question}
+    """
+    try:
+        result = qa({"query": prompt})
+        return result['result']
    except Exception as e:
+        logging.error(f"Error generating answer: {e}")
+        return "Sorry, I couldn't generate an answer due to an internal error."
+
+# --- UI Layout ---
+with st.sidebar:
+    st.header("Upload PDF")
+    uploaded_file = st.file_uploader("Choose a PDF", type=["pdf"])
+
+# --- Main Interface ---
+if uploaded_file:
+    st.success(f"You uploaded: {uploaded_file.name}")
+    full_text = read_pdf(uploaded_file)
+
+    if full_text:
+        st.subheader("PDF Preview")
+        with st.expander("View Extracted Text"):
+            st.write(full_text[:3000] + ("..." if len(full_text) > 3000 else ""))
+
+        st.subheader("Ask a Question")
+        user_question = st.text_input("Type your question about the PDF content")
+
+        if user_question:
+            with st.spinner("Thinking..."):
+                answer = process_answer(user_question, full_text)
+                st.markdown("### Answer")
+                st.write(answer)
+
+        with st.sidebar:
+            st.markdown("---")
+            st.markdown("**Suggestions:**")
+            st.caption('Try: "Summarize this document" or "What is the key idea?"')
+
+    else:
+        st.error("No text could be extracted from the PDF. Try another file.")
else:
+    st.info("Upload a PDF to begin.")
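
Below is a minimal sketch (not part of this commit) of how the refactored helpers compose outside the Streamlit UI. It assumes the functions from this diff (read_pdf, split_text_into_chunks, create_vectorstore, setup_qa) have been factored into a hypothetical rag_utils.py module, that the LaMini-T5 and all-MiniLM-L6-v2 weights can be downloaded, and it accepts that load_model() is wrapped in st.cache_resource, so calling it outside Streamlit may emit a caching warning.

# Hypothetical smoke test; rag_utils is an assumed module holding the helpers above.
from rag_utils import read_pdf, split_text_into_chunks, create_vectorstore, setup_qa

with open("sample.pdf", "rb") as f:        # any local PDF for testing
    full_text = read_pdf(f)                # PyMuPDF extraction; returns "" on failure

docs = split_text_into_chunks(full_text)   # 500-character chunks with 50 overlap
db = create_vectorstore(docs)              # Chroma store persisted under ./db
qa = setup_qa(db)                          # RetrievalQA over the LaMini-T5 pipeline

result = qa({"query": "What is this document about?"})
print(result["result"])                    # answer text; retrieved chunks are in result["source_documents"]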