pradeepsengarr committed
Commit 38fe9c5 · verified · 1 Parent(s): 3794b5e

Update app.py

Files changed (1):
  1. app.py +248 -192

app.py CHANGED
@@ -1,7 +1,8 @@
  # import os
  # import logging
  # import streamlit as st
- # import torch
  # from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
  # from langchain_community.document_loaders import PDFMinerLoader
  # from langchain.text_splitter import RecursiveCharacterTextSplitter
@@ -13,94 +14,201 @@
  # # Set up logging
  # logging.basicConfig(level=logging.INFO)

- # # Paths and model
- # PERSIST_DIRECTORY = "db"
- # UPLOAD_FOLDER = "uploaded_files"
- # os.makedirs(UPLOAD_FOLDER, exist_ok=True)

- # CHECKPOINT = "MBZUAI/LaMini-T5-738M"
- # tokenizer = AutoTokenizer.from_pretrained(CHECKPOINT)
- # base_model = AutoModelForSeq2SeqLM.from_pretrained(CHECKPOINT)
- # device = 0 if torch.cuda.is_available() else -1

- # def ingest_data():
  #     try:
- #         st.info("📚 Ingesting documents...")
-
- #         docs = []
- #         for file_name in os.listdir(UPLOAD_FOLDER):
- #             if file_name.endswith(".pdf"):
- #                 path = os.path.join(UPLOAD_FOLDER, file_name)
- #                 loader = PDFMinerLoader(path)
- #                 loaded_docs = loader.load()
- #                 docs.extend(loaded_docs)
-
- #         if not docs:
- #             st.error("No valid PDFs found.")
  #             return

- #         splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
- #         texts = splitter.split_documents(docs)

  #         embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
- #         db = Chroma.from_documents(texts, embeddings, persist_directory=PERSIST_DIRECTORY)
  #         db.persist()
- #         st.success("✅ Ingestion successful!")
  #     except Exception as e:
- #         logging.error(f"Ingestion error: {str(e)}")
- #         st.error(f"Ingestion error: {str(e)}")
-
- # def get_qa_chain():
- #     embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
- #     vectordb = Chroma(persist_directory=PERSIST_DIRECTORY, embedding_function=embeddings)
- #     retriever = vectordb.as_retriever()

  #     pipe = pipeline(
- #         "text2text-generation",
  #         model=base_model,
  #         tokenizer=tokenizer,
  #         max_length=256,
  #         do_sample=True,
  #         temperature=0.3,
  #         top_p=0.95,
- #         device=device,
  #     )
- #     llm = HuggingFacePipeline(pipeline=pipe)
-
- #     qa_chain = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=retriever, return_source_documents=True)
- #     return qa_chain
-
- # def main():
- #     st.set_page_config(page_title="CA Audit QA Chatbot", layout="wide")
- #     st.title("📄 CA Audit QA Assistant")
-
- #     with st.sidebar:
- #         st.header("📤 Upload Audit PDFs")
- #         uploaded_file = st.file_uploader("Choose a PDF file", type="pdf")
-
- #     if uploaded_file is not None:
- #         file_path = os.path.join(UPLOAD_FOLDER, uploaded_file.name)
- #         with open(file_path, "wb") as f:
- #             f.write(uploaded_file.getbuffer())
- #         st.success(f"{uploaded_file.name} uploaded.")
- #         ingest_data()
-
- #     query = st.text_input("❓ Ask an audit-related question:")
- #     if st.button("🔍 Get Answer") and query:
- #         st.info("Generating answer...")
- #         qa_chain = get_qa_chain()
- #         prompt = f"""
- #         You are an AI assistant helping Chartered Accountants (CAs) in auditing.
- #         Provide accurate, concise answers based on the uploaded documents.
- #         Question: {query}
  #         """
- #         result = qa_chain({"query": prompt})
- #         st.success("✅ Answer:")
- #         st.write(result["result"])

- # if __name__ == "__main__":
- #     main()

  import os
  import logging
  import math
@@ -114,115 +222,79 @@ from langchain_community.vectorstores import Chroma
  from langchain_community.llms import HuggingFacePipeline
  from langchain.chains import RetrievalQA

- # Set up logging
- logging.basicConfig(level=logging.INFO)
-
- # Define global variables
  device = 'cpu'
  persist_directory = "db"
  uploaded_files_dir = "uploaded_files"

- # Streamlit app configuration
- st.set_page_config(page_title="Audit Assistant", layout="wide")
- st.title("Audit Assistant")

- # Load the model
  checkpoint = "MBZUAI/LaMini-T5-738M"
  tokenizer = AutoTokenizer.from_pretrained(checkpoint)
  base_model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

- # Helper Functions

- def extract_text_from_pdf(file_path):
-     """Extract text from a PDF using PyMuPDF (fitz)."""
      try:
-         doc = fitz.open(file_path)
-         text = ""
-         for page_num in range(doc.page_count):
-             page = doc.load_page(page_num)
-             text += page.get_text("text")
-         return text
      except Exception as e:
-         logging.error(f"Error reading PDF {file_path}: {e}")
-         return None

  def data_ingestion():
-     """Function to load PDFs and create embeddings with improved error handling and efficiency."""
      try:
          logging.info("Starting data ingestion")
-
          if not os.path.exists(uploaded_files_dir):
              os.makedirs(uploaded_files_dir)

-         documents = []
          for filename in os.listdir(uploaded_files_dir):
              if filename.endswith(".pdf"):
-                 file_path = os.path.join(uploaded_files_dir, filename)
-                 logging.info(f"Processing file: {file_path}")
-
                  try:
-                     loader = PDFMinerLoader(file_path)
                      loaded_docs = loader.load()
-                     if not loaded_docs:
-                         logging.warning(f"Skipping file with missing or invalid metadata: {file_path}")
-                         continue
-
                      for doc in loaded_docs:
-                         if hasattr(doc, 'page_content') and len(doc.page_content.strip()) > 0:
                              documents.append(doc)
-                         else:
-                             logging.warning(f"Skipping invalid document structure in {file_path}")
-                 except ValueError as e:
-                     logging.error(f"Skipping {file_path}: {str(e)}")
-                     continue

          if not documents:
-             logging.error("No valid documents found to process.")
              return

-         logging.info(f"Total valid documents: {len(documents)}")
-
-         # Proceed with splitting and embedding documents
          text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
          texts = text_splitter.split_documents(documents)

-         logging.info(f"Total text chunks created: {len(texts)}")
-
-         if not texts:
-             logging.error("No valid text chunks to create embeddings.")
-             return
-
          embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
-
-         # Proceed to split and embed the documents
-         MAX_BATCH_SIZE = 5461
-         total_batches = math.ceil(len(texts) / MAX_BATCH_SIZE)
-
-         logging.info(f"Processing {len(texts)} text chunks in {total_batches} batches...")
-
          db = None
-         for i in range(total_batches):
-             batch_start = i * MAX_BATCH_SIZE
-             batch_end = min((i + 1) * MAX_BATCH_SIZE, len(texts))
-             text_batch = texts[batch_start:batch_end]
-
-             logging.info(f"Processing batch {i + 1}/{total_batches}, size: {len(text_batch)}")
-
              if db is None:
-                 db = Chroma.from_documents(text_batch, embeddings, persist_directory=persist_directory)
              else:
-                 db.add_documents(text_batch)
-
          db.persist()
-         logging.info("Data ingestion completed successfully")
-
      except Exception as e:
-         logging.error(f"Error during data ingestion: {str(e)}")
-         raise

  def llm_pipeline():
-     """Set up the language model pipeline."""
-     logging.info("Setting up LLM pipeline")
      pipe = pipeline(
          'text2text-generation',
          model=base_model,
@@ -233,81 +305,65 @@ def llm_pipeline():
          top_p=0.95,
          device=device
      )
-     local_llm = HuggingFacePipeline(pipeline=pipe)
-     logging.info("LLM pipeline setup complete")
-     return local_llm

  def qa_llm():
-     """Set up the question-answering chain."""
-     logging.info("Setting up QA model")
      llm = llm_pipeline()
      embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
      db = Chroma(persist_directory=persist_directory, embedding_function=embeddings)
-     retriever = db.as_retriever()  # Set up the retriever for the vector store
-     qa = RetrievalQA.from_chain_type(
-         llm=llm,
-         chain_type="stuff",
-         retriever=retriever,
-         return_source_documents=True
-     )
-     logging.info("QA model setup complete")
-     return qa

  def process_answer(user_question):
-     """Generate an answer to the user's question."""
      try:
-         logging.info("Processing user question")
-         qa = qa_llm()
-
-         tailored_prompt = f"""
-         You are an expert chatbot designed to assist Chartered Accountants (CAs) in the field of audits.
-         Your goal is to provide accurate and comprehensive answers to any questions related to audit policies, procedures,
-         and accounting standards based on the provided PDF documents.
-         Please respond effectively and refer to the relevant standards and policies whenever applicable.
-
-         User question: {user_question}
          """
-
-         generated_text = qa({"query": tailored_prompt})
-         answer = generated_text['result']
-
-         if "not provide" in answer or "no information" in answer:
-             return "The document does not provide sufficient information to answer your question."
-
-         logging.info("Answer generated successfully")
-         return answer
-
      except Exception as e:
-         logging.error(f"Error during answer generation: {str(e)}")
-         return "Error processing the question."

- # Streamlit UI Setup
- st.sidebar.header("File Upload")
- uploaded_files = st.sidebar.file_uploader("Upload your PDF files", type=["pdf"], accept_multiple_files=True)

  if uploaded_files:
-     # Save uploaded files
      if not os.path.exists(uploaded_files_dir):
          os.makedirs(uploaded_files_dir)

-     for uploaded_file in uploaded_files:
-         file_path = os.path.join(uploaded_files_dir, uploaded_file.name)
-         with open(file_path, "wb") as f:
-             f.write(uploaded_file.getbuffer())
-
-     st.sidebar.success(f"Uploaded {len(uploaded_files)} file(s) successfully!")

-     # Run data ingestion when files are uploaded
-     data_ingestion()

- # Display UI for Q&A
- st.header("Ask a Question")
- user_question = st.text_input("Enter your question here:")

- if user_question:
-     answer = process_answer(user_question)
-     st.write(answer)

  else:
-     st.sidebar.info("Upload PDF files to get started!")
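Both versions of the ingestion routine above write to Chroma in slices of MAX_BATCH_SIZE = 5461, which appears to correspond to the per-call batch limit Chroma's default SQLite backend enforces; the new version drops the math.ceil batch-count bookkeeping and simply strides over the chunk list. A minimal standalone sketch of the same batching idea (the helper name add_in_batches is illustrative, not part of the commit):

def add_in_batches(db, texts, max_batch=5461):
    # Write documents to an existing Chroma collection in slices small
    # enough to stay under the backend's per-call batch limit.
    for start in range(0, len(texts), max_batch):
        db.add_documents(texts[start:start + max_batch])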
  # import os
  # import logging
+ # import math
  # import streamlit as st
+ # import fitz  # PyMuPDF
  # from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
  # from langchain_community.document_loaders import PDFMinerLoader
  # from langchain.text_splitter import RecursiveCharacterTextSplitter

  # # Set up logging
  # logging.basicConfig(level=logging.INFO)

+ # # Define global variables
+ # device = 'cpu'
+ # persist_directory = "db"
+ # uploaded_files_dir = "uploaded_files"

+ # # Streamlit app configuration
+ # st.set_page_config(page_title="Audit Assistant", layout="wide")
+ # st.title("Audit Assistant")

+ # # Load the model
+ # checkpoint = "MBZUAI/LaMini-T5-738M"
+ # tokenizer = AutoTokenizer.from_pretrained(checkpoint)
+ # base_model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)
+
+ # # Helper Functions
+
+ # def extract_text_from_pdf(file_path):
+ #     """Extract text from a PDF using PyMuPDF (fitz)."""
+ #     try:
+ #         doc = fitz.open(file_path)
+ #         text = ""
+ #         for page_num in range(doc.page_count):
+ #             page = doc.load_page(page_num)
+ #             text += page.get_text("text")
+ #         return text
+ #     except Exception as e:
+ #         logging.error(f"Error reading PDF {file_path}: {e}")
+ #         return None
+
+ # def data_ingestion():
+ #     """Function to load PDFs and create embeddings with improved error handling and efficiency."""
  #     try:
+ #         logging.info("Starting data ingestion")
+
+ #         if not os.path.exists(uploaded_files_dir):
+ #             os.makedirs(uploaded_files_dir)
+
+ #         documents = []
+ #         for filename in os.listdir(uploaded_files_dir):
+ #             if filename.endswith(".pdf"):
+ #                 file_path = os.path.join(uploaded_files_dir, filename)
+ #                 logging.info(f"Processing file: {file_path}")
+
+ #                 try:
+ #                     loader = PDFMinerLoader(file_path)
+ #                     loaded_docs = loader.load()
+ #                     if not loaded_docs:
+ #                         logging.warning(f"Skipping file with missing or invalid metadata: {file_path}")
+ #                         continue
+
+ #                     for doc in loaded_docs:
+ #                         if hasattr(doc, 'page_content') and len(doc.page_content.strip()) > 0:
+ #                             documents.append(doc)
+ #                         else:
+ #                             logging.warning(f"Skipping invalid document structure in {file_path}")
+ #                 except ValueError as e:
+ #                     logging.error(f"Skipping {file_path}: {str(e)}")
+ #                     continue
+
+ #         if not documents:
+ #             logging.error("No valid documents found to process.")
  #             return

+ #         logging.info(f"Total valid documents: {len(documents)}")
+
+ #         # Proceed with splitting and embedding documents
+ #         text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
+ #         texts = text_splitter.split_documents(documents)
+
+ #         logging.info(f"Total text chunks created: {len(texts)}")
+
+ #         if not texts:
+ #             logging.error("No valid text chunks to create embeddings.")
+ #             return

  #         embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
+
+ #         # Proceed to split and embed the documents
+ #         MAX_BATCH_SIZE = 5461
+ #         total_batches = math.ceil(len(texts) / MAX_BATCH_SIZE)
+
+ #         logging.info(f"Processing {len(texts)} text chunks in {total_batches} batches...")
+
+ #         db = None
+ #         for i in range(total_batches):
+ #             batch_start = i * MAX_BATCH_SIZE
+ #             batch_end = min((i + 1) * MAX_BATCH_SIZE, len(texts))
+ #             text_batch = texts[batch_start:batch_end]
+
+ #             logging.info(f"Processing batch {i + 1}/{total_batches}, size: {len(text_batch)}")
+
+ #             if db is None:
+ #                 db = Chroma.from_documents(text_batch, embeddings, persist_directory=persist_directory)
+ #             else:
+ #                 db.add_documents(text_batch)
+
  #         db.persist()
+ #         logging.info("Data ingestion completed successfully")
+
  #     except Exception as e:
+ #         logging.error(f"Error during data ingestion: {str(e)}")
+ #         raise

+ # def llm_pipeline():
+ #     """Set up the language model pipeline."""
+ #     logging.info("Setting up LLM pipeline")
  #     pipe = pipeline(
+ #         'text2text-generation',
  #         model=base_model,
  #         tokenizer=tokenizer,
  #         max_length=256,
  #         do_sample=True,
  #         temperature=0.3,
  #         top_p=0.95,
+ #         device=device
+ #     )
+ #     local_llm = HuggingFacePipeline(pipeline=pipe)
+ #     logging.info("LLM pipeline setup complete")
+ #     return local_llm
+
+ # def qa_llm():
+ #     """Set up the question-answering chain."""
+ #     logging.info("Setting up QA model")
+ #     llm = llm_pipeline()
+ #     embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
+ #     db = Chroma(persist_directory=persist_directory, embedding_function=embeddings)
+ #     retriever = db.as_retriever()  # Set up the retriever for the vector store
+ #     qa = RetrievalQA.from_chain_type(
+ #         llm=llm,
+ #         chain_type="stuff",
+ #         retriever=retriever,
+ #         return_source_documents=True
  #     )
+ #     logging.info("QA model setup complete")
+ #     return qa
+
+ # def process_answer(user_question):
+ #     """Generate an answer to the user's question."""
+ #     try:
+ #         logging.info("Processing user question")
+ #         qa = qa_llm()
+
+ #         tailored_prompt = f"""
+ #         You are an expert chatbot designed to assist Chartered Accountants (CAs) in the field of audits.
+ #         Your goal is to provide accurate and comprehensive answers to any questions related to audit policies, procedures,
+ #         and accounting standards based on the provided PDF documents.
+ #         Please respond effectively and refer to the relevant standards and policies whenever applicable.
+
+ #         User question: {user_question}
  #         """
+
+ #         generated_text = qa({"query": tailored_prompt})
+ #         answer = generated_text['result']
+
+ #         if "not provide" in answer or "no information" in answer:
+ #             return "The document does not provide sufficient information to answer your question."
+
+ #         logging.info("Answer generated successfully")
+ #         return answer
+
+ #     except Exception as e:
+ #         logging.error(f"Error during answer generation: {str(e)}")
+ #         return "Error processing the question."
+
+ # # Streamlit UI Setup
+ # st.sidebar.header("File Upload")
+ # uploaded_files = st.sidebar.file_uploader("Upload your PDF files", type=["pdf"], accept_multiple_files=True)
+
+ # if uploaded_files:
+ #     # Save uploaded files
+ #     if not os.path.exists(uploaded_files_dir):
+ #         os.makedirs(uploaded_files_dir)
+
+ #     for uploaded_file in uploaded_files:
+ #         file_path = os.path.join(uploaded_files_dir, uploaded_file.name)
+ #         with open(file_path, "wb") as f:
+ #             f.write(uploaded_file.getbuffer())
+
+ #     st.sidebar.success(f"Uploaded {len(uploaded_files)} file(s) successfully!")
+
+ #     # Run data ingestion when files are uploaded
+ #     data_ingestion()
+
+ # # Display UI for Q&A
+ # st.header("Ask a Question")
+ # user_question = st.text_input("Enter your question here:")
+
+ # if user_question:
+ #     answer = process_answer(user_question)
+ #     st.write(answer)

+ # else:
+ #     st.sidebar.info("Upload PDF files to get started!")
+
+ # -------
  import os
  import logging
  import math
  import streamlit as st
  import fitz  # PyMuPDF
  from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
  from langchain_community.document_loaders import PDFMinerLoader
  from langchain.text_splitter import RecursiveCharacterTextSplitter
  from langchain_community.embeddings import SentenceTransformerEmbeddings
  from langchain_community.vectorstores import Chroma
  from langchain_community.llms import HuggingFacePipeline
  from langchain.chains import RetrievalQA

+ # Configuration
  device = 'cpu'
  persist_directory = "db"
  uploaded_files_dir = "uploaded_files"

+ # Setup logging
+ logging.basicConfig(level=logging.INFO)
+
+ # Streamlit Page Setup
+ st.set_page_config(page_title="RAG Chatbot", layout="wide")
+ st.title("📚 RAG-based PDF Assistant")

+ # Load LLM model
  checkpoint = "MBZUAI/LaMini-T5-738M"
  tokenizer = AutoTokenizer.from_pretrained(checkpoint)
  base_model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

+ # ---------------- HELPER FUNCTIONS ---------------- #

+ def extract_outline_from_pdf(path):
      try:
+         doc = fitz.open(path)
+         outline_text = ""
+         for page_num in range(len(doc)):
+             page = doc[page_num]
+             outline_text += f"### Page {page_num+1}:\n{page.get_text('text')[:500]}\n---\n"
+         return outline_text if outline_text else "No preview available."
      except Exception as e:
+         return f"Could not preview PDF: {e}"

  def data_ingestion():
      try:
          logging.info("Starting data ingestion")
          if not os.path.exists(uploaded_files_dir):
              os.makedirs(uploaded_files_dir)

+         documents = []
          for filename in os.listdir(uploaded_files_dir):
              if filename.endswith(".pdf"):
+                 path = os.path.join(uploaded_files_dir, filename)
+                 logging.info(f"Loading: {filename}")
                  try:
+                     loader = PDFMinerLoader(path)
                      loaded_docs = loader.load()
                      for doc in loaded_docs:
+                         if hasattr(doc, 'page_content'):
                              documents.append(doc)
+                 except Exception as e:
+                     logging.warning(f"Skipping {filename}: {str(e)}")

          if not documents:
+             st.error("⚠️ No valid documents found. Check the PDF content.")
              return

          text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
          texts = text_splitter.split_documents(documents)

          embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
          db = None
+         MAX_BATCH_SIZE = 5461
+         for i in range(0, len(texts), MAX_BATCH_SIZE):
+             batch = texts[i:i + MAX_BATCH_SIZE]
              if db is None:
+                 db = Chroma.from_documents(batch, embeddings, persist_directory=persist_directory)
              else:
+                 db.add_documents(batch)
          db.persist()
+         logging.info("Data ingestion completed.")
      except Exception as e:
+         logging.error(f"Ingestion error: {e}")
+         st.error(f"Ingestion failed: {e}")

  def llm_pipeline():
      pipe = pipeline(
          'text2text-generation',
          model=base_model,
          tokenizer=tokenizer,
          max_length=256,
          do_sample=True,
          temperature=0.3,
          top_p=0.95,
          device=device
      )
+     return HuggingFacePipeline(pipeline=pipe)

  def qa_llm():
      llm = llm_pipeline()
      embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
      db = Chroma(persist_directory=persist_directory, embedding_function=embeddings)
+     retriever = db.as_retriever()
+     return RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=retriever, return_source_documents=True)

  def process_answer(user_question):
      try:
+         qa = qa_llm()
+         prompt = f"""
+         You are a helpful and accurate RAG-based chatbot. Your role is to analyze the content from uploaded PDF documents and
+         provide informative and detailed answers to any questions asked by the user. Use the uploaded knowledge to answer precisely.

+         Question: {user_question}
          """
+         output = qa({"query": prompt})
+         return output['result']
      except Exception as e:
+         logging.error(f"QA failed: {e}")
+         return "❌ Could not generate a valid answer."
+
+ # ---------------- STREAMLIT UI ---------------- #

+ # Sidebar Upload
+ st.sidebar.header("📤 Upload PDF Files")
+ uploaded_files = st.sidebar.file_uploader("Select one or more PDF files", type="pdf", accept_multiple_files=True)

  if uploaded_files:
      if not os.path.exists(uploaded_files_dir):
          os.makedirs(uploaded_files_dir)

+     for file in uploaded_files:
+         path = os.path.join(uploaded_files_dir, file.name)
+         with open(path, "wb") as f:
+             f.write(file.getbuffer())
+
+     st.sidebar.success(f"{len(uploaded_files)} file(s) uploaded.")

+     # Display previews
+     st.subheader("📄 Uploaded PDF Previews")
+     for file in uploaded_files:
+         with st.expander(file.name):
+             st.text(extract_outline_from_pdf(os.path.join(uploaded_files_dir, file.name)))

+     # Trigger ingestion
+     with st.spinner("🔄 Ingesting uploaded documents..."):
+         data_ingestion()

+     # Ask a question
+     st.header("❓ Ask a Question from Your Documents")
+     user_input = st.text_input("Enter your question:")
+     if user_input:
+         with st.spinner("💬 Generating response..."):
+             response = process_answer(user_input)
+             st.success(response)

  else:
+     st.sidebar.info("Upload PDFs to begin your QA journey.")
369