pradeepsengarr committed on
Commit
c46f62c
·
verified ·
1 Parent(s): 1b0749c

Update app.py

Files changed (1)
  1. app.py +131 -131
app.py CHANGED
@@ -1,137 +1,137 @@
- import os
- import streamlit as st
- import fitz  # PyMuPDF
- import logging
- from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
- from langchain.text_splitter import RecursiveCharacterTextSplitter
- from langchain_community.vectorstores import Chroma
- from langchain_community.embeddings import SentenceTransformerEmbeddings
- from langchain_community.llms import HuggingFacePipeline
- from langchain.chains import RetrievalQA
- from langchain.prompts import PromptTemplate
- from langchain_community.document_loaders import TextLoader
-
- # --- Configuration ---
- st.set_page_config(page_title="📚 RAG PDF Chatbot", layout="wide")
- st.title("📚 RAG-based PDF Chatbot")
- device = "cpu"
-
- # --- Logging ---
- logging.basicConfig(level=logging.INFO)
-
- # --- Load LLM ---
- @st.cache_resource
- def load_model():
-     checkpoint = "MBZUAI/LaMini-T5-738M"
-     tokenizer = AutoTokenizer.from_pretrained(checkpoint)
-     model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)
-     pipe = pipeline('text2text-generation', model=model, tokenizer=tokenizer, max_length=1024, do_sample=True, temperature=0.3, top_k=50, top_p=0.95)
-     return HuggingFacePipeline(pipeline=pipe)
-
- # --- Extract PDF Text ---
- def read_pdf(file):
-     try:
-         doc = fitz.open(stream=file.read(), filetype="pdf")
-         text = ""
-         for page in doc:
-             text += page.get_text()
-         return text.strip()
-     except Exception as e:
-         logging.error(f"Failed to extract text: {e}")
-         return ""
-
- # --- Process Answer ---
- def process_answer(question, full_text):
-     # Save the full_text to a temporary file
-     with open("temp_text.txt", "w") as f:
-         f.write(full_text)
-
-     loader = TextLoader("temp_text.txt")
-     docs = loader.load()
-
-     # Chunk the documents with increased size and overlap
-     text_splitter = RecursiveCharacterTextSplitter(chunk_size=800, chunk_overlap=300)
-     splits = text_splitter.split_documents(docs)
-
-     # Load embeddings
-     embeddings = SentenceTransformerEmbeddings(model_name="BAAI/bge-base-en-v1.5")
-
-     # Create Chroma in-memory vector store
-     db = Chroma.from_documents(splits, embedding=embeddings)
-     retriever = db.as_retriever()
-
-     # Set up the model
-     llm = load_model()
-
-     # Create a custom prompt
-     prompt_template = PromptTemplate(
-         input_variables=["context", "question"],
-         template="""
- You are a helpful assistant. Carefully analyze the given context and extract direct answers ONLY from it.

- Context:
- {context}

- Question:
- {question}

- Important Instructions:
- - If the question asks for a URL (e.g., LinkedIn link), provide the exact URL as it appears.
- - Do NOT summarize or paraphrase.
- - If the information is not in the context, say "Not found in the document."

- Answer:
- """)


-     # Retrieval QA with custom prompt
-     qa_chain = RetrievalQA.from_chain_type(
-         llm=llm,
-         retriever=retriever,
-         chain_type="stuff",
-         chain_type_kwargs={"prompt": prompt_template}
-     )
-
-     # Return the answer using the retrieval QA chain
-     return qa_chain.run(question)
-
- # --- UI Layout ---
- with st.sidebar:
-     st.header("📄 Upload PDF")
-     uploaded_file = st.file_uploader("Choose a PDF", type=["pdf"])
-
- # --- Main Interface ---
- if uploaded_file:
-     st.success(f"You uploaded: {uploaded_file.name}")
-     full_text = read_pdf(uploaded_file)
-
-     if full_text:
-         st.subheader("📝 PDF Preview")
-         with st.expander("View Extracted Text"):
-             st.write(full_text[:3000] + ("..." if len(full_text) > 3000 else ""))
-
-         st.subheader("💬 Ask a Question")
-         user_question = st.text_input("Type your question about the PDF content")
-
-         if user_question:
-             with st.spinner("Thinking..."):
-                 answer = process_answer(user_question, full_text)
-                 st.markdown("### 🤖 Answer")
-                 st.write(answer)
-
-             with st.sidebar:
-                 st.markdown("---")
-                 st.markdown("**💡 Suggestions:**")
-                 st.caption("Try: \"Summarize this document\" or \"What is the key idea?\"")
-                 with st.expander("💡 Suggestions", expanded=True):
-                     st.markdown("""
- - "Summarize this document"
- - "Give a quick summary"
- - "What are the main points?"
- - "Explain this document in short"
- """)
-
-     else:
-         st.error("⚠️ No text could be extracted from the PDF. Try another file.")
- else:
-     st.info("Upload a PDF to begin.")
 
+ # import os
+ # import streamlit as st
+ # import fitz  # PyMuPDF
+ # import logging
+ # from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
+ # from langchain.text_splitter import RecursiveCharacterTextSplitter
+ # from langchain_community.vectorstores import Chroma
+ # from langchain_community.embeddings import SentenceTransformerEmbeddings
+ # from langchain_community.llms import HuggingFacePipeline
+ # from langchain.chains import RetrievalQA
+ # from langchain.prompts import PromptTemplate
+ # from langchain_community.document_loaders import TextLoader
+
+ # # --- Configuration ---
+ # st.set_page_config(page_title="📚 RAG PDF Chatbot", layout="wide")
+ # st.title("📚 RAG-based PDF Chatbot")
+ # device = "cpu"
+
+ # # --- Logging ---
+ # logging.basicConfig(level=logging.INFO)
+
+ # # --- Load LLM ---
+ # @st.cache_resource
+ # def load_model():
+ #     checkpoint = "MBZUAI/LaMini-T5-738M"
+ #     tokenizer = AutoTokenizer.from_pretrained(checkpoint)
+ #     model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)
+ #     pipe = pipeline('text2text-generation', model=model, tokenizer=tokenizer, max_length=1024, do_sample=True, temperature=0.3, top_k=50, top_p=0.95)
+ #     return HuggingFacePipeline(pipeline=pipe)
+
+ # # --- Extract PDF Text ---
+ # def read_pdf(file):
+ #     try:
+ #         doc = fitz.open(stream=file.read(), filetype="pdf")
+ #         text = ""
+ #         for page in doc:
+ #             text += page.get_text()
+ #         return text.strip()
+ #     except Exception as e:
+ #         logging.error(f"Failed to extract text: {e}")
+ #         return ""
+
+ # # --- Process Answer ---
+ # def process_answer(question, full_text):
+ #     # Save the full_text to a temporary file
+ #     with open("temp_text.txt", "w") as f:
+ #         f.write(full_text)
+
+ #     loader = TextLoader("temp_text.txt")
+ #     docs = loader.load()
+
+ #     # Chunk the documents with increased size and overlap
+ #     text_splitter = RecursiveCharacterTextSplitter(chunk_size=800, chunk_overlap=300)
+ #     splits = text_splitter.split_documents(docs)
+
+ #     # Load embeddings
+ #     embeddings = SentenceTransformerEmbeddings(model_name="BAAI/bge-base-en-v1.5")
+
+ #     # Create Chroma in-memory vector store
+ #     db = Chroma.from_documents(splits, embedding=embeddings)
+ #     retriever = db.as_retriever()
+
+ #     # Set up the model
+ #     llm = load_model()
+
+ #     # Create a custom prompt
+ #     prompt_template = PromptTemplate(
+ #         input_variables=["context", "question"],
+ #         template="""
+ # You are a helpful assistant. Carefully analyze the given context and extract direct answers ONLY from it.

+ # Context:
+ # {context}

+ # Question:
+ # {question}

+ # Important Instructions:
+ # - If the question asks for a URL (e.g., LinkedIn link), provide the exact URL as it appears.
+ # - Do NOT summarize or paraphrase.
+ # - If the information is not in the context, say "Not found in the document."

+ # Answer:
+ # """)


+ #     # Retrieval QA with custom prompt
+ #     qa_chain = RetrievalQA.from_chain_type(
+ #         llm=llm,
+ #         retriever=retriever,
+ #         chain_type="stuff",
+ #         chain_type_kwargs={"prompt": prompt_template}
+ #     )
+
+ #     # Return the answer using the retrieval QA chain
+ #     return qa_chain.run(question)
+
+ # # --- UI Layout ---
+ # with st.sidebar:
+ #     st.header("📄 Upload PDF")
+ #     uploaded_file = st.file_uploader("Choose a PDF", type=["pdf"])
+
+ # # --- Main Interface ---
+ # if uploaded_file:
+ #     st.success(f"You uploaded: {uploaded_file.name}")
+ #     full_text = read_pdf(uploaded_file)
+
+ #     if full_text:
+ #         st.subheader("📝 PDF Preview")
+ #         with st.expander("View Extracted Text"):
+ #             st.write(full_text[:3000] + ("..." if len(full_text) > 3000 else ""))
+
+ #         st.subheader("💬 Ask a Question")
+ #         user_question = st.text_input("Type your question about the PDF content")
+
+ #         if user_question:
+ #             with st.spinner("Thinking..."):
+ #                 answer = process_answer(user_question, full_text)
+ #                 st.markdown("### 🤖 Answer")
+ #                 st.write(answer)
+
+ #             with st.sidebar:
+ #                 st.markdown("---")
+ #                 st.markdown("**💡 Suggestions:**")
+ #                 st.caption("Try: \"Summarize this document\" or \"What is the key idea?\"")
+ #                 with st.expander("💡 Suggestions", expanded=True):
+ #                     st.markdown("""
+ # - "Summarize this document"
+ # - "Give a quick summary"
+ # - "What are the main points?"
+ # - "Explain this document in short"
+ # """)
+
+ #     else:
+ #         st.error("⚠️ No text could be extracted from the PDF. Try another file.")
+ # else:
+ #     st.info("Upload a PDF to begin.")