Spaces:

pradeepsengarr
/

Bot_RAG

Sleeping

App Files Community

pradeepsengarr committed on Apr 15

Commit

28c38fd

verified ·

1 Parent(s): 94f70e7

Update app.py

Browse files

Files changed (1) hide show

app.py +66 -70

app.py CHANGED Viewed

@@ -1,7 +1,7 @@
 import os
 import logging
-import torch
 import streamlit as st
 from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
 from langchain_community.document_loaders import PDFMinerLoader
 from langchain.text_splitter import RecursiveCharacterTextSplitter
@@ -10,97 +10,93 @@ from langchain_community.vectorstores import Chroma
 from langchain_community.llms import HuggingFacePipeline
 from langchain.chains import RetrievalQA
-# Setup
 logging.basicConfig(level=logging.INFO)
-device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
-persist_directory = "db"
-uploaded_files_dir = "uploaded_files"
-os.makedirs(uploaded_files_dir, exist_ok=True)
-checkpoint = "MBZUAI/LaMini-T5-738M"
-tokenizer = AutoTokenizer.from_pretrained(checkpoint)
-base_model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)
-def data_ingestion():
     try:
-        documents = []
-        for filename in os.listdir(uploaded_files_dir):
-            if filename.endswith(".pdf"):
-                file_path = os.path.join(uploaded_files_dir, filename)
-                loader = PDFMinerLoader(file_path)
-                docs = loader.load()
-                for doc in docs:
-                    if hasattr(doc, 'page_content') and len(doc.page_content.strip()) > 0:
-                        documents.append(doc)
-        if not documents:
-            st.error("No valid text extracted from uploaded PDFs.")
             return
         splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
-        texts = splitter.split_documents(documents)
         embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
-        db = Chroma.from_documents(texts, embeddings, persist_directory=persist_directory)
         db.persist()
-        st.success("Document ingested and stored successfully.")
     except Exception as e:
-        st.error(f"Error during data ingestion: {str(e)}")
-def qa_llm():
     pipe = pipeline(
-        'text2text-generation',
         model=base_model,
         tokenizer=tokenizer,
         max_length=256,
         do_sample=True,
         temperature=0.3,
         top_p=0.95,
-        device=0 if torch.cuda.is_available() else -1
     )
     llm = HuggingFacePipeline(pipeline=pipe)
-    embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
-    db = Chroma(persist_directory=persist_directory, embedding_function=embeddings)
-    retriever = db.as_retriever()
-    qa = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=retriever, return_source_documents=True)
-    return qa
-def process_query(query):
-    try:
-        qa = qa_llm()
-        tailored_prompt = f"""
-        You are an expert chatbot designed to assist Chartered Accountants (CAs) in the field of audits.
-        Your goal is to provide accurate and comprehensive answers to any questions related to audit policies,
-        procedures, and accounting standards based on the uploaded PDF documents.
-        User question: {query}
         """
-        result = qa({"query": tailored_prompt})
-        return result["result"]
-    except Exception as e:
-        return f"Error: {str(e)}"
-# Streamlit UI
-st.set_page_config(page_title="CA Audit Chatbot", layout="centered")
-st.title("📚 Chartered Accountant Audit Assistant")
-st.markdown("Upload a PDF file and ask audit-related questions. This AI assistant will answer based on document content.")
-# File uploader
-uploaded_file = st.file_uploader("Upload PDF file", type=["pdf"])
-if uploaded_file is not None:
-    save_path = os.path.join(uploaded_files_dir, uploaded_file.name)
-    with open(save_path, "wb") as f:
-        f.write(uploaded_file.getbuffer())
-    st.success("PDF uploaded successfully!")
-    if st.button("Ingest Document"):
-        data_ingestion()
-# Query input
-user_query = st.text_input("Ask a question about the audit document:")
-if user_query:
-    response = process_query(user_query)
-    st.markdown("### 📌 Answer:")
-    st.write(response)

 import os
 import logging
 import streamlit as st
+import torch
 from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
 from langchain_community.document_loaders import PDFMinerLoader
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain_community.llms import HuggingFacePipeline
 from langchain.chains import RetrievalQA
+# Set up logging
 logging.basicConfig(level=logging.INFO)
+# Paths and model
+PERSIST_DIRECTORY = "db"
+UPLOAD_FOLDER = "uploaded_files"
+os.makedirs(UPLOAD_FOLDER, exist_ok=True)
+CHECKPOINT = "MBZUAI/LaMini-T5-738M"
+tokenizer = AutoTokenizer.from_pretrained(CHECKPOINT)
+base_model = AutoModelForSeq2SeqLM.from_pretrained(CHECKPOINT)
+device = 0 if torch.cuda.is_available() else -1
+def ingest_data():
     try:
+        st.info("📚 Ingesting documents...")
+        docs = []
+        for file_name in os.listdir(UPLOAD_FOLDER):
+            if file_name.endswith(".pdf"):
+                path = os.path.join(UPLOAD_FOLDER, file_name)
+                loader = PDFMinerLoader(path)
+                loaded_docs = loader.load()
+                docs.extend(loaded_docs)
+        if not docs:
+            st.error("No valid PDFs found.")
             return
         splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
+        texts = splitter.split_documents(docs)
         embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
+        db = Chroma.from_documents(texts, embeddings, persist_directory=PERSIST_DIRECTORY)
         db.persist()
+        st.success("✅ Ingestion successful!")
     except Exception as e:
+        logging.error(f"Ingestion error: {str(e)}")
+        st.error(f"Ingestion error: {str(e)}")
+def get_qa_chain():
+    embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
+    vectordb = Chroma(persist_directory=PERSIST_DIRECTORY, embedding_function=embeddings)
+    retriever = vectordb.as_retriever()
     pipe = pipeline(
+        "text2text-generation",
         model=base_model,
         tokenizer=tokenizer,
         max_length=256,
         do_sample=True,
         temperature=0.3,
         top_p=0.95,
+        device=device,
     )
     llm = HuggingFacePipeline(pipeline=pipe)
+    qa_chain = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=retriever, return_source_documents=True)
+    return qa_chain
+def main():
+    st.set_page_config(page_title="CA Audit QA Chatbot", layout="wide")
+    st.title("📄 CA Audit QA Assistant")
+    with st.sidebar:
+        st.header("📤 Upload Audit PDFs")
+        uploaded_file = st.file_uploader("Choose a PDF file", type="pdf")
+        if uploaded_file is not None:
+            file_path = os.path.join(UPLOAD_FOLDER, uploaded_file.name)
+            with open(file_path, "wb") as f:
+                f.write(uploaded_file.getbuffer())
+            st.success(f"{uploaded_file.name} uploaded.")
+            ingest_data()
+    query = st.text_input("❓ Ask an audit-related question:")
+    if st.button("🔍 Get Answer") and query:
+        st.info("Generating answer...")
+        qa_chain = get_qa_chain()
+        prompt = f"""
+        You are an AI assistant helping Chartered Accountants (CAs) in auditing.
+        Provide accurate, concise answers based on the uploaded documents.
+        Question: {query}
         """
+        result = qa_chain({"query": prompt})
+        st.success("✅ Answer:")
+        st.write(result["result"])
+if __name__ == "__main__":
+    main()