import os
import logging

import torch
import streamlit as st
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
from langchain_community.document_loaders import PDFMinerLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import SentenceTransformerEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_community.llms import HuggingFacePipeline
from langchain.chains import RetrievalQA
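# Dependencies implied by the imports above (an assumption, not pinned in the
# original): streamlit, torch, transformers, sentence-transformers, chromadb,
# langchain, langchain-community, pdfminer.six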
# Setup
logging.basicConfig(level=logging.INFO)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
persist_directory = "db"                 # where the Chroma index lives on disk
uploaded_files_dir = "uploaded_files"    # where uploaded PDFs are saved
os.makedirs(uploaded_files_dir, exist_ok=True)
checkpoint = "MBZUAI/LaMini-T5-738M"

@st.cache_resource(show_spinner=False)  # cache across Streamlit reruns; no spinner so set_page_config stays the first UI command
def load_model():
    return AutoTokenizer.from_pretrained(checkpoint), AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

tokenizer, base_model = load_model()
def data_ingestion():
    """Load every PDF in the upload folder, split it into chunks, and index it in Chroma."""
    try:
        documents = []
        for filename in os.listdir(uploaded_files_dir):
            if filename.endswith(".pdf"):
                file_path = os.path.join(uploaded_files_dir, filename)
                loader = PDFMinerLoader(file_path)
                docs = loader.load()
                # Keep only documents with extractable text (scanned PDFs may yield none)
                for doc in docs:
                    if hasattr(doc, 'page_content') and len(doc.page_content.strip()) > 0:
                        documents.append(doc)

        if not documents:
            st.error("No valid text extracted from uploaded PDFs.")
            return

        # Overlapping chunks preserve context across chunk boundaries
        splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
        texts = splitter.split_documents(documents)

        embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
        db = Chroma.from_documents(texts, embeddings, persist_directory=persist_directory)
        db.persist()  # older Chroma needs an explicit persist; 0.4+ persists automatically
        st.success("Document ingested and stored successfully.")
    except Exception as e:
        st.error(f"Error during data ingestion: {str(e)}")
def qa_llm():
    """Build a RetrievalQA chain: LaMini-T5 generation over the persisted Chroma index."""
    pipe = pipeline(
        'text2text-generation',
        model=base_model,
        tokenizer=tokenizer,
        max_length=256,
        do_sample=True,
        temperature=0.3,  # low temperature keeps answers close to the retrieved context
        top_p=0.95,
        device=0 if device.type == 'cuda' else -1
    )
    llm = HuggingFacePipeline(pipeline=pipe)
    embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
    db = Chroma(persist_directory=persist_directory, embedding_function=embeddings)
    retriever = db.as_retriever()
    # "stuff" packs all retrieved chunks into a single prompt for the model
    qa = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=retriever,
        return_source_documents=True,
    )
    return qa
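# Example usage of the chain (a sketch; the sample question is illustrative):
#   qa = qa_llm()
#   out = qa.invoke({"query": "Which audit procedures does the document describe?"})
#   out["result"]            -> the generated answer
#   out["source_documents"]  -> retrieved chunks (return_source_documents=True above)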
def process_query(query):
    try:
        qa = qa_llm()
        # Note: the persona text is part of the retrieval query here, so the vector
        # search also matches against the boilerplate (see the sketch after this function).
        tailored_prompt = f"""
        You are an expert chatbot designed to assist Chartered Accountants (CAs) in the field of audits.
        Your goal is to provide accurate and comprehensive answers to any questions related to audit policies,
        procedures, and accounting standards, based on the uploaded PDF documents.
        User question: {query}
        """
        result = qa.invoke({"query": tailored_prompt})
        return result["result"]
    except Exception as e:
        return f"Error: {str(e)}"
# Streamlit UI
st.set_page_config(page_title="CA Audit Chatbot", layout="centered")
st.title("Chartered Accountant Audit Assistant")
st.markdown("Upload a PDF file and ask audit-related questions. This AI assistant will answer based on document content.")

# File uploader
uploaded_file = st.file_uploader("Upload PDF file", type=["pdf"])
if uploaded_file is not None:
    save_path = os.path.join(uploaded_files_dir, uploaded_file.name)
    with open(save_path, "wb") as f:
        f.write(uploaded_file.getbuffer())
    st.success("PDF uploaded successfully!")

    if st.button("Ingest Document"):
        data_ingestion()
# Query input
user_query = st.text_input("Ask a question about the audit document:")
if user_query:
    response = process_query(user_query)
    st.markdown("### Answer:")
    st.write(response)
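# Sketch (not wired in): to cite sources in the UI, have process_query return the
# full result dict instead of result["result"], then render the retrieved chunks:
#   with st.expander("Retrieved context"):
#       for doc in result["source_documents"]:
#           st.caption(doc.metadata.get("source", "unknown source"))
#           st.write(doc.page_content[:300])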