Update app.py

app.py
CHANGED
@@ -422,21 +422,19 @@ import os
 import streamlit as st
 import fitz  # PyMuPDF
 import logging
-import
+import tempfile
+import shutil
 from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain_community.vectorstores import Chroma
 from langchain_community.embeddings import SentenceTransformerEmbeddings
 from langchain_community.llms import HuggingFacePipeline
 from langchain.chains import RetrievalQA
-from
-from sentence_transformers import SentenceTransformer
-from langchain_community.embeddings import HuggingFaceEmbeddings
+from langchain_community.document_loaders import TextLoader

 # --- Configuration ---
 st.set_page_config(page_title="📄 RAG PDF Chatbot", layout="wide")
 st.title("📄 RAG-based PDF Chatbot")
-persist_directory = "db"
 device = "cpu"

 # --- Logging ---
@@ -463,94 +461,8 @@ def read_pdf(file):
         logging.error(f"Failed to extract text: {e}")
         return ""

-# --- Split Text into Chunks ---
-def split_text_into_chunks(text):
-    splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
-    return splitter.create_documents([text])
-
-import os
-import shutil
-from sentence_transformers import SentenceTransformer
-from langchain.embeddings import HuggingFaceEmbeddings
-from langchain.vectorstores import Chroma
-
-# Setup a writable directory for Chroma
-chroma_dir = "/home/user/app/chroma_db"  # Change this to an absolute writable directory
-if os.path.exists(chroma_dir):
-    shutil.rmtree(chroma_dir)  # Clear any old data
-os.makedirs(chroma_dir, exist_ok=True)
-
-# Initialize the model and embeddings
-model = SentenceTransformer("all-MiniLM-L6-v2", device='cpu')
-embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
-
-# Create the Chroma database
-try:
-    db = Chroma.from_documents(splits, embeddings, persist_directory=chroma_dir)
-    db.persist()
-    print(f"Vectorstore created successfully at {chroma_dir}")
-except Exception as e:
-    print(f"Error creating vectorstore: {e}")
-
-
-# --- Setup QA Chain ---
-def setup_qa(db):
-    retriever = db.as_retriever()
-    llm = load_model()
-    return RetrievalQA.from_chain_type(llm=llm, retriever=retriever, return_source_documents=True)
-
 # --- Process Answer ---
-# def process_answer(question, full_text):
-#     # STEP 1: Chunk the PDF text
-#     text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
-#     docs = text_splitter.create_documents([full_text])
-
-#     # STEP 2: Create embeddings
-#     embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
-#     db = Chroma.from_documents(docs, embeddings)
-
-#     # STEP 3: Retrieve relevant chunks using the question
-#     retriever = db.as_retriever(search_type="similarity", search_kwargs={"k": 5})
-#     relevant_docs = retriever.get_relevant_documents(question)
-
-#     # STEP 4: Format the context
-#     context = "\n\n".join([doc.page_content for doc in relevant_docs])
-
-#     # STEP 5: Prompting
-#     prompt_template = """
-#     You are a helpful assistant that answers questions based on the context below.
-
-#     Context:
-#     {context}
-
-#     Question: {question}
-
-#     Answer:
-#     """.strip()
-
-#     prompt = prompt_template.format(context=context, question=question)
-
-#     # STEP 6: Load the model and generate response
-#     llm = HuggingFacePipeline.from_model_id(
-#         model_id="MBZUAI/LaMini-T5-738M",
-#         task="text2text-generation",
-#         model_kwargs={"temperature": 0.3, "max_length": 256},
-#     )
-
-#     return llm.invoke(prompt)
-
-import tempfile
-import os
-
 def process_answer(question, full_text):
-    from langchain_community.document_loaders import TextLoader
-    from langchain.text_splitter import RecursiveCharacterTextSplitter
-    from langchain.vectorstores import Chroma
-    from langchain_community.embeddings import SentenceTransformerEmbeddings
-    from langchain.chains import RetrievalQA
-    from langchain import HuggingFacePipeline
-    from transformers import pipeline
-
     # Save the full_text to a temporary file
     with open("temp_text.txt", "w") as f:
         f.write(full_text)
@@ -568,15 +480,13 @@ def process_answer(question, full_text):
     # Create a temporary directory for ChromaDB
     chroma_dir = os.path.join(tempfile.gettempdir(), "chroma_db")
     if os.path.exists(chroma_dir):
-        import shutil
         shutil.rmtree(chroma_dir)

     db = Chroma.from_documents(splits, embeddings, persist_directory=chroma_dir)
     retriever = db.as_retriever()

     # Set up the model
-
-    llm = HuggingFacePipeline(pipeline=pipe)
+    llm = load_model()

     # RAG-style retrieval QA
     qa_chain = RetrievalQA.from_chain_type(llm=llm, retriever=retriever)
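Note: `load_model()` is called in the hunk above but defined outside the changed lines, so its body does not appear in this diff. Purely as a hedged illustration, a minimal sketch of what such a helper could look like, assuming it wraps the same MBZUAI/LaMini-T5-738M checkpoint and generation settings as the commented-out code this commit deletes (every name below other than the imports is hypothetical):

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
from langchain_community.llms import HuggingFacePipeline

def load_model():
    # Hypothetical reconstruction; the real load_model() in app.py may differ.
    checkpoint = "MBZUAI/LaMini-T5-738M"
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)
    pipe = pipeline(
        "text2text-generation",
        model=model,
        tokenizer=tokenizer,
        max_length=256,
        do_sample=True,
        temperature=0.3,
    )
    # Wrap the transformers pipeline so LangChain's RetrievalQA can drive it as an LLM
    return HuggingFacePipeline(pipeline=pipe)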
@@ -589,12 +499,10 @@ def process_answer(question, full_text):
     else:
         return qa_chain.run(question)

-
 # --- UI Layout ---
 with st.sidebar:
     st.header("📄 Upload PDF")
     uploaded_file = st.file_uploader("Choose a PDF", type=["pdf"])
-

 # --- Main Interface ---
 if uploaded_file:
@@ -602,7 +510,7 @@ if uploaded_file:
     full_text = read_pdf(uploaded_file)

     if full_text:
-        st.subheader("
+        st.subheader("📄 PDF Preview")
         with st.expander("View Extracted Text"):
             st.write(full_text[:3000] + ("..." if len(full_text) > 3000 else ""))

@@ -618,7 +526,7 @@ if uploaded_file:
         with st.sidebar:
             st.markdown("---")
             st.markdown("**💡 Suggestions:**")
-            st.caption("Try: \"Summarize this document\" or \"What is the key idea?\"
+            st.caption("Try: \"Summarize this document\" or \"What is the key idea?\"")
             with st.expander("💡 Suggestions", expanded=True):
                 st.markdown("""
                 - "Summarize this document"
@@ -627,7 +535,6 @@ if uploaded_file:
                 - "Explain this document in short"
                 """)

-
     else:
         st.error("⚠️ No text could be extracted from the PDF. Try another file.")
 else:
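The lines between the second and third hunks (new lines 469-479 of app.py) are unchanged and therefore not shown, yet that is where `splits` and `embeddings` must be built before `Chroma.from_documents(splits, embeddings, ...)` runs. A rough sketch of that hidden stage, assuming it reuses the TextLoader import added at the top of the file, the chunking parameters of the deleted split_text_into_chunks helper, and the all-MiniLM-L6-v2 model used elsewhere; all variable names except `splits` and `embeddings` are hypothetical:

    # Hypothetical reconstruction of the unchanged region inside process_answer()
    # Load the text that was just written to temp_text.txt
    loader = TextLoader("temp_text.txt")
    documents = loader.load()

    # Chunk the document for retrieval (parameters from the deleted helper)
    splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    splits = splitter.split_documents(documents)

    # Embed the chunks with the same MiniLM model used elsewhere in the file
    embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

Rebuilding the Chroma index under tempfile.gettempdir() keeps the vector store in a writable location, which appears to be what the removed module-level /home/user/app/chroma_db block was attempting.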