pradeepsengarr committed on
Commit
709f6b7
·
verified ·
1 Parent(s): a646995

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +106 -0
app.py ADDED
@@ -0,0 +1,106 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import logging
3
+ import torch
4
+ import streamlit as st
5
+ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
6
+ from langchain_community.document_loaders import PDFMinerLoader
7
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
8
+ from langchain_community.embeddings import SentenceTransformerEmbeddings
9
+ from langchain_community.vectorstores import Chroma
10
+ from langchain_community.llms import HuggingFacePipeline
11
+ from langchain.chains import RetrievalQA
12
+
13
# Setup
logging.basicConfig(level=logging.INFO)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Directory handed to Chroma as its persist location.
persist_directory = "db"
# Directory where uploaded PDFs are written before ingestion.
uploaded_files_dir = "uploaded_files"
os.makedirs(uploaded_files_dir, exist_ok=True)

checkpoint = "MBZUAI/LaMini-T5-738M"


# Streamlit re-executes this script on every user interaction, so loading the
# checkpoint with plain top-level statements would repeat the load on each
# rerun. Caching it as a resource keeps a single copy per server process.
# show_spinner=False avoids emitting a UI element here, before
# st.set_page_config() runs further down the script.
@st.cache_resource(show_spinner=False)
def _load_checkpoint(name):
    """Return ``(tokenizer, model)`` for the seq2seq checkpoint *name*."""
    return (
        AutoTokenizer.from_pretrained(name),
        AutoModelForSeq2SeqLM.from_pretrained(name),
    )


tokenizer, base_model = _load_checkpoint(checkpoint)
24
+
25
def data_ingestion():
    """Index every PDF found in ``uploaded_files_dir`` into the Chroma store.

    Each PDF is loaded with ``PDFMinerLoader``; documents whose text is empty
    or whitespace-only are dropped. The remaining documents are split into
    500-character chunks with a 100-character overlap, embedded with the
    ``all-MiniLM-L6-v2`` sentence-transformer model, and written to a Chroma
    collection persisted under ``persist_directory``.

    Outcome is reported through the Streamlit UI (``st.success`` /
    ``st.error``); nothing is returned. Any exception is logged with its
    traceback and shown to the user instead of propagating.
    """
    try:
        documents = []
        for filename in os.listdir(uploaded_files_dir):
            # Compare case-insensitively so files such as "REPORT.PDF" are
            # ingested rather than silently skipped.
            if not filename.lower().endswith(".pdf"):
                continue
            file_path = os.path.join(uploaded_files_dir, filename)
            for doc in PDFMinerLoader(file_path).load():
                # Keep only documents that actually carry extracted text.
                if hasattr(doc, 'page_content') and len(doc.page_content.strip()) > 0:
                    documents.append(doc)

        if not documents:
            st.error("No valid text extracted from uploaded PDFs.")
            return

        splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
        texts = splitter.split_documents(documents)

        embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

        # NOTE(review): every call re-indexes all PDFs in the upload directory;
        # repeated ingestion presumably adds duplicate chunks -- confirm.
        db = Chroma.from_documents(texts, embeddings, persist_directory=persist_directory)
        db.persist()
        st.success("Document ingested and stored successfully.")

    except Exception as e:
        # UI boundary: keep the traceback in the log, show a short message.
        logging.exception("Data ingestion failed")
        st.error(f"Error during data ingestion: {str(e)}")
52
+
53
def qa_llm():
    """Build and return a ``RetrievalQA`` chain over the persisted Chroma store.

    The chain wraps the module-level ``base_model`` / ``tokenizer`` in a
    ``text2text-generation`` pipeline (sampling enabled, output capped at 256
    tokens), retrieves context from the Chroma collection stored under
    ``persist_directory``, and is configured to return source documents
    alongside the answer.
    """
    generation_settings = {
        "model": base_model,
        "tokenizer": tokenizer,
        "max_length": 256,
        "do_sample": True,
        "temperature": 0.3,
        "top_p": 0.95,
        # Pipeline device index: first GPU when CUDA is available, else CPU.
        "device": 0 if torch.cuda.is_available() else -1,
    }
    text_generator = pipeline('text2text-generation', **generation_settings)
    language_model = HuggingFacePipeline(pipeline=text_generator)

    # Must be the same embedding model that was used when the store was built.
    embedder = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
    vector_store = Chroma(persist_directory=persist_directory, embedding_function=embedder)

    return RetrievalQA.from_chain_type(
        llm=language_model,
        chain_type="stuff",
        retriever=vector_store.as_retriever(),
        return_source_documents=True,
    )
70
+
71
def process_query(query):
    """Answer *query* using the retrieval QA chain.

    The user's question is embedded in a fixed instruction prompt and passed
    to the chain built by ``qa_llm()`` under the ``"query"`` key.

    Args:
        query: The user's question text.

    Returns:
        The chain's ``"result"`` value on success, or a string of the form
        ``"Error: <message>"`` if anything raises. Exceptions are never
        propagated to the caller; they are logged with a traceback.
    """
    try:
        qa = qa_llm()
        tailored_prompt = (
            "\n"
            "You are an expert chatbot designed to assist Chartered Accountants (CAs) in the field of audits.\n"
            "Your goal is to provide accurate and comprehensive answers to any questions related to audit policies,\n"
            "procedures, and accounting standards based on the uploaded PDF documents.\n"
            "\n"
            f"User question: {query}\n"
        )
        # Calling the chain object directly is deprecated; invoke() takes the
        # same input mapping and returns the same output mapping.
        result = qa.invoke({"query": tailored_prompt})
        return result["result"]
    except Exception as e:
        # UI boundary: keep the traceback in the log, return a short message.
        logging.exception("Query processing failed")
        return f"Error: {str(e)}"
85
+
86
# Streamlit UI
st.set_page_config(page_title="CA Audit Chatbot", layout="centered")
st.title("📚 Chartered Accountant Audit Assistant")
st.markdown("Upload a PDF file and ask audit-related questions. This AI assistant will answer based on document content.")

# File uploader
uploaded_file = st.file_uploader("Upload PDF file", type=["pdf"])
if uploaded_file is not None:
    # The file name is supplied by the client; keep only its final path
    # component so the write cannot land outside the upload directory.
    safe_name = os.path.basename(uploaded_file.name)
    save_path = os.path.join(uploaded_files_dir, safe_name)
    with open(save_path, "wb") as f:
        f.write(uploaded_file.getbuffer())
    st.success("PDF uploaded successfully!")
    # Ingestion is an explicit step so the index is only rebuilt on request.
    if st.button("Ingest Document"):
        data_ingestion()

# Query input
user_query = st.text_input("Ask a question about the audit document:")
if user_query:
    response = process_query(user_query)
    st.markdown("### 📌 Answer:")
    st.write(response)