# Hugging Face Space: pradeepsengarr/Bot_RAG (status: Sleeping)
# File size: 4,014 Bytes, revision 709f6b7

import hashlib
import logging
import os

import streamlit as st
import torch
from langchain.chains import RetrievalQA
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PDFMinerLoader
from langchain_community.embeddings import SentenceTransformerEmbeddings
from langchain_community.llms import HuggingFacePipeline
from langchain_community.vectorstores import Chroma
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, pipeline

# Setup
logging.basicConfig(level=logging.INFO)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Where the Chroma index is persisted and where uploaded PDFs are stored.
persist_directory = "db"
uploaded_files_dir = "uploaded_files"
os.makedirs(uploaded_files_dir, exist_ok=True)

checkpoint = "MBZUAI/LaMini-T5-738M"


# Streamlit re-executes this script on every widget interaction. Without the
# cache the tokenizer and model would be reloaded on each rerun.
# show_spinner=False keeps this call from emitting a UI element before
# st.set_page_config() runs further down.
@st.cache_resource(show_spinner=False)
def _load_model(name):
    """Load the tokenizer and seq2seq model for *name* once per process.

    Args:
        name: Hugging Face checkpoint identifier.

    Returns:
        A ``(tokenizer, model)`` tuple.
    """
    return (
        AutoTokenizer.from_pretrained(name),
        AutoModelForSeq2SeqLM.from_pretrained(name),
    )


tokenizer, base_model = _load_model(checkpoint)

def data_ingestion():
    """Index every PDF in ``uploaded_files_dir`` into the persisted Chroma store.

    Loads each PDF with ``PDFMinerLoader``, drops documents with no text,
    splits the rest into 500-character chunks (100 overlap), embeds them with
    ``all-MiniLM-L6-v2`` and writes them to ``persist_directory``.

    Outcome is reported through ``st.success`` / ``st.error``; nothing is
    returned. Any exception is logged with its traceback and shown to the
    user instead of being raised.
    """
    import hashlib  # local import keeps this function self-contained

    try:
        documents = []
        # sorted(): deterministic ingestion order regardless of filesystem.
        for filename in sorted(os.listdir(uploaded_files_dir)):
            # Case-insensitive match so e.g. "REPORT.PDF" is not silently skipped.
            if not filename.lower().endswith(".pdf"):
                continue
            file_path = os.path.join(uploaded_files_dir, filename)
            loader = PDFMinerLoader(file_path)
            documents.extend(
                doc
                for doc in loader.load()
                if (getattr(doc, "page_content", None) or "").strip()
            )

        if not documents:
            st.error("No valid text extracted from uploaded PDFs.")
            return

        splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
        texts = splitter.split_documents(documents)

        # Deterministic chunk ids (source + position within that source +
        # content). Every ingest re-reads all uploaded PDFs; with random ids
        # each run would append another copy of every chunk to the persisted
        # collection. With stable ids an unchanged chunk maps to the same id,
        # so re-ingestion is expected to overwrite rather than duplicate.
        # NOTE(review): chunks of a PDF whose content changed keep their old
        # entries; a full rebuild would be needed to purge those.
        chunk_ids = []
        seen_per_source = {}
        for chunk in texts:
            source = str((chunk.metadata or {}).get("source", ""))
            position = seen_per_source.get(source, 0)
            seen_per_source[source] = position + 1
            key = f"{source}\x00{position}\x00{chunk.page_content}"
            chunk_ids.append(hashlib.sha256(key.encode("utf-8")).hexdigest())

        embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

        db = Chroma.from_documents(
            texts,
            embeddings,
            ids=chunk_ids,
            persist_directory=persist_directory,
        )
        db.persist()
        st.success("Document ingested and stored successfully.")

    except Exception as e:
        # UI boundary: keep the app alive, but record the traceback.
        logging.exception("Data ingestion failed")
        st.error(f"Error during data ingestion: {str(e)}")

def qa_llm():
    """Build the retrieval QA chain over the persisted Chroma store.

    Wraps the module-level ``base_model`` / ``tokenizer`` in a
    ``text2text-generation`` pipeline, opens the Chroma collection in
    ``persist_directory`` with ``all-MiniLM-L6-v2`` embeddings, and combines
    both in a "stuff" ``RetrievalQA`` chain.

    Returns:
        A ``RetrievalQA`` chain configured to return source documents.
    """
    # Sampling settings for answer generation.
    generation_settings = {
        "max_length": 256,
        "do_sample": True,
        "temperature": 0.3,
        "top_p": 0.95,
    }
    # GPU index 0 when CUDA is available, otherwise -1 (CPU).
    pipeline_device = 0 if torch.cuda.is_available() else -1
    text_generator = pipeline(
        'text2text-generation',
        model=base_model,
        tokenizer=tokenizer,
        device=pipeline_device,
        **generation_settings,
    )
    language_model = HuggingFacePipeline(pipeline=text_generator)

    embedder = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
    vector_store = Chroma(
        persist_directory=persist_directory,
        embedding_function=embedder,
    )

    return RetrievalQA.from_chain_type(
        llm=language_model,
        chain_type="stuff",
        retriever=vector_store.as_retriever(),
        return_source_documents=True,
    )

def process_query(query):
    """Answer *query* using the retrieval QA chain.

    Args:
        query: The user's question. ``None``, empty, or whitespace-only
            input is rejected without running the model.

    Returns:
        The chain's answer text, ``"Please enter a question."`` for blank
        input, or ``"Error: <message>"`` if building or running the chain
        fails (the exception is logged, never raised).
    """
    # Guard first: building the chain and running generation is expensive
    # and pointless for a blank question.
    if not query or not query.strip():
        return "Please enter a question."

    try:
        qa = qa_llm()
        # NOTE(review): the whole prompt below, persona text included, is
        # passed as the chain's "query", so it is presumably also what the
        # retriever embeds. Consider moving the persona into a prompt
        # template on the chain so retrieval uses only the user's question.
        tailored_prompt = f"""
        You are an expert chatbot designed to assist Chartered Accountants (CAs) in the field of audits.
        Your goal is to provide accurate and comprehensive answers to any questions related to audit policies,
        procedures, and accounting standards based on the uploaded PDF documents.

        User question: {query}
        """
        result = qa({"query": tailored_prompt})
        return result["result"]
    except Exception as e:
        # UI boundary: report the failure as text, but keep the traceback.
        logging.exception("Query processing failed")
        return f"Error: {str(e)}"

# Streamlit UI
st.set_page_config(page_title="CA Audit Chatbot", layout="centered")
st.title("📚 Chartered Accountant Audit Assistant")
st.markdown("Upload a PDF file and ask audit-related questions. This AI assistant will answer based on document content.")

# File uploader
uploaded_file = st.file_uploader("Upload PDF file", type=["pdf"])
if uploaded_file is not None:
    # The file name is supplied by the client. Keep only its final path
    # component so a crafted name (e.g. "../../app.py") cannot write outside
    # the upload directory. Backslashes are normalised first so Windows-style
    # separators are stripped on POSIX hosts too.
    safe_name = os.path.basename(uploaded_file.name.replace("\\", "/"))
    if safe_name in ("", ".", ".."):
        st.error("Invalid file name.")
    else:
        save_path = os.path.join(uploaded_files_dir, safe_name)
        with open(save_path, "wb") as f:
            f.write(uploaded_file.getbuffer())
        st.success("PDF uploaded successfully!")
        if st.button("Ingest Document"):
            data_ingestion()

# Query input
user_query = st.text_input("Ask a question about the audit document:")
# Skip whitespace-only input so it does not trigger a full QA run.
if user_query and user_query.strip():
    response = process_query(user_query)
    st.markdown("### 📌 Answer:")
    st.write(response)