pradeepsengarr committed on
Commit cb0ff81 · verified · 1 Parent(s): 9192513

Update app.py

Files changed (1):
  1. app.py (+58 −482)

app.py CHANGED
@@ -1,434 +1,18 @@
- # import os
- # import logging
- # import math
- # import streamlit as st
- # import fitz  # PyMuPDF
- # from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
- # from langchain_community.document_loaders import PDFMinerLoader
- # from langchain.text_splitter import RecursiveCharacterTextSplitter
- # from langchain_community.embeddings import SentenceTransformerEmbeddings
- # from langchain_community.vectorstores import Chroma
- # from langchain_community.llms import HuggingFacePipeline
- # from langchain.chains import RetrievalQA
-
- # # Set up logging
- # logging.basicConfig(level=logging.INFO)
-
- # # Define global variables
- # device = 'cpu'
- # persist_directory = "db"
- # uploaded_files_dir = "uploaded_files"
-
- # # Streamlit app configuration
- # st.set_page_config(page_title="Audit Assistant", layout="wide")
- # st.title("Audit Assistant")
-
- # # Load the model
- # checkpoint = "MBZUAI/LaMini-T5-738M"
- # tokenizer = AutoTokenizer.from_pretrained(checkpoint)
- # base_model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)
-
- # # Helper Functions
-
- # def extract_text_from_pdf(file_path):
- #     """Extract text from a PDF using PyMuPDF (fitz)."""
- #     try:
- #         doc = fitz.open(file_path)
- #         text = ""
- #         for page_num in range(doc.page_count):
- #             page = doc.load_page(page_num)
- #             text += page.get_text("text")
- #         return text
- #     except Exception as e:
- #         logging.error(f"Error reading PDF {file_path}: {e}")
- #         return None
-
- # def data_ingestion():
- #     """Function to load PDFs and create embeddings with improved error handling and efficiency."""
- #     try:
- #         logging.info("Starting data ingestion")
-
- #         if not os.path.exists(uploaded_files_dir):
- #             os.makedirs(uploaded_files_dir)
-
- #         documents = []
- #         for filename in os.listdir(uploaded_files_dir):
- #             if filename.endswith(".pdf"):
- #                 file_path = os.path.join(uploaded_files_dir, filename)
- #                 logging.info(f"Processing file: {file_path}")
-
- #                 try:
- #                     loader = PDFMinerLoader(file_path)
- #                     loaded_docs = loader.load()
- #                     if not loaded_docs:
- #                         logging.warning(f"Skipping file with missing or invalid metadata: {file_path}")
- #                         continue
-
- #                     for doc in loaded_docs:
- #                         if hasattr(doc, 'page_content') and len(doc.page_content.strip()) > 0:
- #                             documents.append(doc)
- #                         else:
- #                             logging.warning(f"Skipping invalid document structure in {file_path}")
- #                 except ValueError as e:
- #                     logging.error(f"Skipping {file_path}: {str(e)}")
- #                     continue
-
- #         if not documents:
- #             logging.error("No valid documents found to process.")
- #             return
-
- #         logging.info(f"Total valid documents: {len(documents)}")
-
- #         # Proceed with splitting and embedding documents
- #         text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
- #         texts = text_splitter.split_documents(documents)
-
- #         logging.info(f"Total text chunks created: {len(texts)}")
-
- #         if not texts:
- #             logging.error("No valid text chunks to create embeddings.")
- #             return
-
- #         embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
-
- #         # Proceed to split and embed the documents
- #         MAX_BATCH_SIZE = 5461
- #         total_batches = math.ceil(len(texts) / MAX_BATCH_SIZE)
-
- #         logging.info(f"Processing {len(texts)} text chunks in {total_batches} batches...")
-
- #         db = None
- #         for i in range(total_batches):
- #             batch_start = i * MAX_BATCH_SIZE
- #             batch_end = min((i + 1) * MAX_BATCH_SIZE, len(texts))
- #             text_batch = texts[batch_start:batch_end]
-
- #             logging.info(f"Processing batch {i + 1}/{total_batches}, size: {len(text_batch)}")
-
- #             if db is None:
- #                 db = Chroma.from_documents(text_batch, embeddings, persist_directory=persist_directory)
- #             else:
- #                 db.add_documents(text_batch)
-
- #         db.persist()
- #         logging.info("Data ingestion completed successfully")
-
- #     except Exception as e:
- #         logging.error(f"Error during data ingestion: {str(e)}")
- #         raise
-
- # def llm_pipeline():
- #     """Set up the language model pipeline."""
- #     logging.info("Setting up LLM pipeline")
- #     pipe = pipeline(
- #         'text2text-generation',
- #         model=base_model,
- #         tokenizer=tokenizer,
- #         max_length=256,
- #         do_sample=True,
- #         temperature=0.3,
- #         top_p=0.95,
- #         device=device
- #     )
- #     local_llm = HuggingFacePipeline(pipeline=pipe)
- #     logging.info("LLM pipeline setup complete")
- #     return local_llm
-
- # def qa_llm():
- #     """Set up the question-answering chain."""
- #     logging.info("Setting up QA model")
- #     llm = llm_pipeline()
- #     embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
- #     db = Chroma(persist_directory=persist_directory, embedding_function=embeddings)
- #     retriever = db.as_retriever()  # Set up the retriever for the vector store
- #     qa = RetrievalQA.from_chain_type(
- #         llm=llm,
- #         chain_type="stuff",
- #         retriever=retriever,
- #         return_source_documents=True
- #     )
- #     logging.info("QA model setup complete")
- #     return qa
-
- # def process_answer(user_question):
- #     """Generate an answer to the user's question."""
- #     try:
- #         logging.info("Processing user question")
- #         qa = qa_llm()
-
- #         tailored_prompt = f"""
- #         You are an expert chatbot designed to assist Chartered Accountants (CAs) in the field of audits.
- #         Your goal is to provide accurate and comprehensive answers to any questions related to audit policies, procedures,
- #         and accounting standards based on the provided PDF documents.
- #         Please respond effectively and refer to the relevant standards and policies whenever applicable.
-
- #         User question: {user_question}
- #         """
-
- #         generated_text = qa({"query": tailored_prompt})
- #         answer = generated_text['result']
-
- #         if "not provide" in answer or "no information" in answer:
- #             return "The document does not provide sufficient information to answer your question."
-
- #         logging.info("Answer generated successfully")
- #         return answer
-
- #     except Exception as e:
- #         logging.error(f"Error during answer generation: {str(e)}")
- #         return "Error processing the question."
-
- # # Streamlit UI Setup
- # st.sidebar.header("File Upload")
- # uploaded_files = st.sidebar.file_uploader("Upload your PDF files", type=["pdf"], accept_multiple_files=True)
-
- # if uploaded_files:
- #     # Save uploaded files
- #     if not os.path.exists(uploaded_files_dir):
- #         os.makedirs(uploaded_files_dir)
-
- #     for uploaded_file in uploaded_files:
- #         file_path = os.path.join(uploaded_files_dir, uploaded_file.name)
- #         with open(file_path, "wb") as f:
- #             f.write(uploaded_file.getbuffer())
-
- #     st.sidebar.success(f"Uploaded {len(uploaded_files)} file(s) successfully!")
-
- #     # Run data ingestion when files are uploaded
- #     data_ingestion()
-
- #     # Display UI for Q&A
- #     st.header("Ask a Question")
- #     user_question = st.text_input("Enter your question here:")
-
- #     if user_question:
- #         answer = process_answer(user_question)
- #         st.write(answer)
-
- # else:
- #     st.sidebar.info("Upload PDF files to get started!")
-
- # ------- this is the second code!!!
- # import os
- # import logging
- # import math
- # import streamlit as st
- # import fitz  # PyMuPDF
- # from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
- # # from langchain_community.document_loaders import PDFMinerLoader
- # from langchain_community.document_loaders import PyMuPDFLoader
- # from langchain.text_splitter import RecursiveCharacterTextSplitter
- # from langchain_community.embeddings import SentenceTransformerEmbeddings
- # from langchain_community.vectorstores import Chroma
- # from langchain_community.llms import HuggingFacePipeline
- # from langchain.chains import RetrievalQA
-
- # device = 'cpu'
- # persist_directory = "db"
- # uploaded_files_dir = "uploaded_files"
-
-
- # logging.basicConfig(level=logging.INFO)
-
- # # for main Page Setup
- # st.set_page_config(page_title="RAG Chatbot", layout="wide")
- # st.title("📚 RAG-based PDF Assistant")
-
- # # Load my model
- # checkpoint = "MBZUAI/LaMini-T5-738M"
- # tokenizer = AutoTokenizer.from_pretrained(checkpoint)
- # base_model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)
-
- # # ------------------------------- #
-
- # def extract_outline_from_pdf(path):
- #     try:
- #         doc = fitz.open(path)
- #         outline_text = ""
- #         for page_num in range(len(doc)):
- #             page = doc[page_num]
- #             outline_text += f"### Page {page_num+1}:\n{page.get_text('text')[:500]}\n---\n"
- #         return outline_text if outline_text else "No preview available."
- #     except Exception as e:
- #         return f"Could not preview PDF: {e}"
-
- # def data_ingestion():
- #     """Load PDFs, validate content, and generate embeddings."""
- #     try:
- #         logging.info("Starting data ingestion")
-
- #         if not os.path.exists(uploaded_files_dir):
- #             os.makedirs(uploaded_files_dir)
-
- #         documents = []
- #         for filename in os.listdir(uploaded_files_dir):
- #             if filename.endswith(".pdf"):
- #                 file_path = os.path.join(uploaded_files_dir, filename)
- #                 logging.info(f"Processing file: {file_path}")
-
- #                 try:
- #                     loader = PyMuPDFLoader(file_path)
- #                     loaded_docs = loader.load()
-
- #                     # Check if any content exists in loaded_docs
- #                     if not loaded_docs or len(loaded_docs[0].page_content.strip()) == 0:
- #                         logging.warning(f"No readable text found in {file_path}. Might be a scanned image or unsupported format.")
- #                         continue
-
- #                     for doc in loaded_docs:
- #                         if hasattr(doc, 'page_content') and len(doc.page_content.strip()) > 0:
- #                             documents.append(doc)
- #                         else:
- #                             logging.warning(f"Skipping invalid document structure in {file_path}")
-
- #                 except Exception as e:
- #                     logging.error(f"Skipping {file_path}: {str(e)}")
- #                     continue
-
- #         if not documents:
- #             logging.error("No valid documents found to process.")
- #             return
-
- #         logging.info(f"Total valid documents: {len(documents)}")
-
- #         # Proceed with splitting and embedding documents
- #         text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
- #         texts = text_splitter.split_documents(documents)
-
- #         logging.info(f"Total text chunks created: {len(texts)}")
-
- #         if not texts:
- #             logging.error("No valid text chunks to create embeddings.")
- #             return
-
- #         embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
-
- #         MAX_BATCH_SIZE = 5461
- #         total_batches = math.ceil(len(texts) / MAX_BATCH_SIZE)
-
- #         logging.info(f"Processing {len(texts)} text chunks in {total_batches} batches...")
-
- #         db = None
- #         for i in range(total_batches):
- #             batch_start = i * MAX_BATCH_SIZE
- #             batch_end = min((i + 1) * MAX_BATCH_SIZE, len(texts))
- #             text_batch = texts[batch_start:batch_end]
-
- #             logging.info(f"Processing batch {i + 1}/{total_batches}, size: {len(text_batch)}")
-
- #             if db is None:
- #                 db = Chroma.from_documents(text_batch, embeddings, persist_directory=persist_directory)
- #             else:
- #                 db.add_documents(text_batch)
-
- #         db.persist()
- #         logging.info("Data ingestion completed successfully")
-
- #     except Exception as e:
- #         logging.error(f"Error during data ingestion: {str(e)}")
- #         raise
-
-
- # def llm_pipeline():
- #     pipe = pipeline(
- #         'text2text-generation',
- #         model=base_model,
- #         tokenizer=tokenizer,
- #         max_length=256,
- #         do_sample=True,
- #         temperature=0.3,
- #         top_p=0.95,
- #         device=device
- #     )
- #     return HuggingFacePipeline(pipeline=pipe)
-
- # def qa_llm():
- #     llm = llm_pipeline()
- #     embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
- #     db = Chroma(persist_directory=persist_directory, embedding_function=embeddings)
- #     retriever = db.as_retriever()
- #     return RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=retriever, return_source_documents=True)
-
- # def process_answer(user_question):
- #     """Generate an answer to the user's question using a general RAG-based prompt."""
- #     try:
- #         logging.info("Processing user question")
- #         qa = qa_llm()  # Set up the retrieval-based QA chain
-
- #         # Generalized, flexible prompt for any kind of PDF (resume, legal doc, etc.)
- #         tailored_prompt = f"""
- #         You are an intelligent and helpful AI assistant that provides answers strictly based on the provided document contents.
- #         If the question cannot be answered using the documents, say: 'The document does not contain this information.'
- #         Otherwise, respond clearly and concisely with relevant and factual details from the PDF.
-
- #         Question: {user_question}
- #         """
-
- #         generated_text = qa({"query": tailored_prompt})
- #         answer = generated_text['result']
-
- #         # Add a safeguard for hallucinated answers
- #         if "not provide" in answer.lower() or "no information" in answer.lower() or len(answer.strip()) < 10:
- #             return "The document does not contain this information."
-
- #         logging.info("Answer generated successfully")
- #         return answer
-
- #     except Exception as e:
- #         logging.error(f"Error during answer generation: {str(e)}")
- #         return "Sorry, something went wrong while processing your question."
-
-
- # # ---------------- STREAMLIT UI ---------------- #
-
- # # Sidebar Upload
- # st.sidebar.header("📤 Upload PDF Files")
- # uploaded_files = st.sidebar.file_uploader("Select one or more PDF files", type="pdf", accept_multiple_files=True)
-
- # if uploaded_files:
- #     if not os.path.exists(uploaded_files_dir):
- #         os.makedirs(uploaded_files_dir)
-
- #     for file in uploaded_files:
- #         path = os.path.join(uploaded_files_dir, file.name)
- #         with open(path, "wb") as f:
- #             f.write(file.getbuffer())
-
- #     st.sidebar.success(f"{len(uploaded_files)} file(s) uploaded.")
-
- #     # Display previews
- #     st.subheader("📄 Uploaded PDF Previews")
- #     for file in uploaded_files:
- #         with st.expander(file.name):
- #             st.text(extract_outline_from_pdf(os.path.join(uploaded_files_dir, file.name)))
-
- #     # Trigger ingestion
- #     with st.spinner("🔄 Ingesting uploaded documents..."):
- #         data_ingestion()
-
- #     # Ask a question
- #     st.header("❓ Ask a Question from Your Documents")
- #     user_input = st.text_input("Enter your question:")
- #     if user_input:
- #         with st.spinner("💬 Generating response..."):
- #             response = process_answer(user_input)
- #             st.success(response)
-
- # else:
- #     st.sidebar.info("Upload PDFs to begin your QA journey.")
-
  import os
- import streamlit as st
+ import shutil
+ import tempfile
  import fitz  # PyMuPDF
+ import streamlit as st
  import logging
+
  from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
  from langchain.text_splitter import RecursiveCharacterTextSplitter
  from langchain_community.vectorstores import Chroma
  from langchain_community.embeddings import SentenceTransformerEmbeddings
- from langchain_community.llms import HuggingFacePipeline
  from langchain.chains import RetrievalQA
+ from langchain_community.llms import HuggingFacePipeline
  from langchain.prompts import PromptTemplate
- from langchain.docstore.document import Document
+ from langchain_community.document_loaders import TextLoader

  # --- Streamlit Config ---
  st.set_page_config(page_title="📚 RAG PDF Chatbot", layout="wide")
@@ -437,60 +21,36 @@ st.title("📚 RAG-based PDF Chatbot")
  # --- Logging ---
  logging.basicConfig(level=logging.INFO)

- # --- Load LLM Model ---
+ # --- Load Model ---
  @st.cache_resource
- def load_llm():
+ def load_model():
      checkpoint = "MBZUAI/LaMini-T5-738M"
      tokenizer = AutoTokenizer.from_pretrained(checkpoint)
      model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)
      pipe = pipeline("text2text-generation", model=model, tokenizer=tokenizer, max_length=512)
      return HuggingFacePipeline(pipeline=pipe)

- # --- PDF Text Extraction ---
+ # --- Extract PDF Text ---
  def extract_text_from_pdf(file):
      try:
          doc = fitz.open(stream=file.read(), filetype="pdf")
-         full_text = ""
-         for page in doc:
-             full_text += page.get_text()
-         return full_text.strip()
+         return "\n".join([page.get_text() for page in doc])
      except Exception as e:
          logging.error(f"Error reading PDF: {e}")
          return ""

- # --- Build Vectorstore (no persist) ---
- def create_vectorstore(text_chunks, embeddings):
-     documents = [Document(page_content=chunk) for chunk in text_chunks]
-     db = Chroma.from_documents(documents, embedding=embeddings)  # ❌ no persist_directory
+ # --- Create Chroma Vectorstore Safely ---
+ def create_vectorstore(documents, embeddings):
+     temp_dir = tempfile.mkdtemp()  # unique, writable temp dir
+     db = Chroma.from_documents(documents, embedding=embeddings, persist_directory=temp_dir)
      return db

- # --- Smart Chunking ---
- def chunk_text(full_text):
-     splitter = RecursiveCharacterTextSplitter(
-         chunk_size=1000,
-         chunk_overlap=150,
-         separators=["\n\n", "\n", ".", "!", "?", " ", ""]
-     )
-     return splitter.split_text(full_text)
-
- # --- Answering Logic ---
- def process_question(question, full_text):
-     if not full_text:
-         return "No valid text extracted from PDF."
-
-     text_chunks = chunk_text(full_text)
-     embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
-     vectorstore = create_vectorstore(text_chunks, embeddings)
-     retriever = vectorstore.as_retriever()
-
-     llm = load_llm()
-
-     # ✅ Custom PromptTemplate
+ # --- Build RAG QA Chain ---
+ def build_qa_chain(retriever, llm):
      prompt_template = PromptTemplate(
          input_variables=["context", "question"],
          template="""
- You are a helpful assistant. Answer the user's question based only on the provided document context below.
- If the answer is in the context, answer it accurately. If not, say: "The document does not provide enough information."
+ You are a helpful assistant. Use the context below to answer the user's question as accurately and truthfully as possible.

  Context:
  {context}
@@ -498,54 +58,70 @@ Context:
  Question:
  {question}

- Answer:"""
+ Helpful Answer:
+ """
      )
+     return RetrievalQA.from_chain_type(llm=llm, retriever=retriever, chain_type_kwargs={"prompt": prompt_template})

-     qa = RetrievalQA.from_chain_type(
-         llm=llm,
-         retriever=retriever,
-         chain_type="stuff",
-         chain_type_kwargs={"prompt": prompt_template},
-         return_source_documents=False,
-     )
-
+ # --- Process QA ---
+ def process_question(question, full_text):
+     # Write PDF text to temp file
+     with open("temp_text.txt", "w") as f:
+         f.write(full_text)
+
+     loader = TextLoader("temp_text.txt")
+     docs = loader.load()
+
+     text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=150)
+     chunks = text_splitter.split_documents(docs)
+
+     embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
+     vectorstore = create_vectorstore(chunks, embeddings)
+     retriever = vectorstore.as_retriever()
+
+     llm = load_model()
+     qa = build_qa_chain(retriever, llm)
      return qa.run(question)

- # --- Streamlit UI ---
+ # --- Sidebar Upload ---
  with st.sidebar:
-     st.header("📄 Upload PDF")
-     uploaded_file = st.file_uploader("Choose a PDF file", type=["pdf"])
+     st.header("📄 Upload your PDF")
+     uploaded_file = st.file_uploader("Upload PDF file", type=["pdf"])

+ # --- Main Logic ---
  if uploaded_file:
      st.success(f"Uploaded: {uploaded_file.name}")
      full_text = extract_text_from_pdf(uploaded_file)

      if full_text:
-         st.subheader("📝 PDF Preview")
-         with st.expander("📝 View Extracted Text"):
+         with st.expander("📄 View Extracted PDF Text", expanded=False):
              st.write(full_text[:3000] + ("..." if len(full_text) > 3000 else ""))

-         st.subheader("💬 Ask your question")
-         user_question = st.text_input("Enter your question about the PDF")
+         st.subheader("💬 Ask Something")
+         user_question = st.text_input("Ask a question about the document")

          if user_question:
-             with st.spinner("🤖 Generating Answer..."):
-                 answer = process_question(user_question, full_text)
+             with st.spinner("Analyzing..."):
+                 try:
+                     answer = process_question(user_question, full_text)
+                 except Exception as e:
+                     st.error("⚠️ Something went wrong. Try re-uploading the PDF.")
+                     st.stop()
              st.markdown("### 🤖 Answer")
              st.write(answer)

              with st.sidebar:
                  st.markdown("---")
-                 st.markdown("**💡 Suggestions:**")
-                 st.caption("Try: \"Summarize this document\" or \"What is the key idea?\"")
+                 st.caption("💡 Sample Questions")
                  st.markdown("""
-                 - "Summarize this document"
-                 - "What is the background of Pradeep Singh Sengar?"
-                 - "What experience does he have?"
-                 - "List key skills mentioned in the document."
+                 - "Summarize the document"
+                 - "What is the experience of Pradeep Singh Sengar?"
+                 - "What are the key points?"
+                 - "Explain in short"
                  """)
      else:
-         st.error("❌ No extractable text found in this PDF. Try another file.")
+         st.error("❌ Could not extract text. Try a different PDF.")
  else:
-     st.info("Upload a PDF to begin.")
+     st.info("Upload a PDF to get started.")
+
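
The substantive fix in this commit is create_vectorstore(): the previous version called Chroma.from_documents() with no persist_directory, and the older commented-out drafts pointed it at a fixed "db" folder, while the new code persists each index to a fresh tempfile.mkdtemp() directory that is guaranteed to be writable. A minimal standalone sketch of the same pattern, assuming langchain-community, chromadb, and sentence-transformers are installed (the two sample documents are made up for illustration):

    import tempfile

    from langchain.docstore.document import Document
    from langchain_community.embeddings import SentenceTransformerEmbeddings
    from langchain_community.vectorstores import Chroma

    # Made-up stand-ins for the chunked PDF text.
    docs = [
        Document(page_content="Chroma writes its index files under persist_directory."),
        Document(page_content="tempfile.mkdtemp() returns a fresh, writable directory."),
    ]

    embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

    # Same pattern as the new create_vectorstore(): a unique, writable temp
    # directory per call, so the app never depends on a fixed "db" path.
    db = Chroma.from_documents(docs, embedding=embeddings,
                               persist_directory=tempfile.mkdtemp())

    print(db.similarity_search("where does Chroma store data?", k=1)[0].page_content)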
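
One detail of the new process_question() worth flagging: it writes the already-extracted text to temp_text.txt and immediately reloads it through TextLoader, which also means concurrent sessions share one file name. A sketch of an in-memory alternative, not the committed code (build_retriever is a hypothetical helper; same assumed dependencies as above):

    import tempfile

    from langchain.docstore.document import Document
    from langchain.text_splitter import RecursiveCharacterTextSplitter
    from langchain_community.embeddings import SentenceTransformerEmbeddings
    from langchain_community.vectorstores import Chroma

    def build_retriever(full_text: str):
        # Split the extracted PDF text directly instead of round-tripping it
        # through temp_text.txt and TextLoader as process_question() does.
        splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=150)
        docs = [Document(page_content=chunk) for chunk in splitter.split_text(full_text)]
        embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
        db = Chroma.from_documents(docs, embedding=embeddings,
                                   persist_directory=tempfile.mkdtemp())
        return db.as_retriever()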