pradeepsengarr committed on
Commit 8d47fc3 · verified · 1 Parent(s): e428e3e

Update app.py

Files changed (1): app.py +71 -78
app.py CHANGED
@@ -420,9 +420,9 @@
 import os
 import streamlit as st
 import fitz  # PyMuPDF
-import logging
 import tempfile
 import shutil
+import logging
 from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain_community.vectorstores import Chroma
@@ -430,132 +430,125 @@ from langchain_community.embeddings import SentenceTransformerEmbeddings
 from langchain_community.llms import HuggingFacePipeline
 from langchain.chains import RetrievalQA
 from langchain_community.document_loaders import TextLoader
+from langchain.docstore.document import Document
 
-# --- Configuration ---
+# --- Streamlit Config ---
 st.set_page_config(page_title="📚 RAG PDF Chatbot", layout="wide")
 st.title("📚 RAG-based PDF Chatbot")
-device = "cpu"
 
 # --- Logging ---
 logging.basicConfig(level=logging.INFO)
 
-# --- Load LLM ---
+# --- Load LLM Model ---
 @st.cache_resource
-def load_model():
+def load_llm():
     checkpoint = "MBZUAI/LaMini-T5-738M"
     tokenizer = AutoTokenizer.from_pretrained(checkpoint)
     model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)
     pipe = pipeline("text2text-generation", model=model, tokenizer=tokenizer, max_length=512)
     return HuggingFacePipeline(pipeline=pipe)
 
-# --- Extract PDF Text (Improved) ---
-def read_pdf(file):
+# --- PDF Text Extraction ---
+def extract_text_from_pdf(file):
     try:
         doc = fitz.open(stream=file.read(), filetype="pdf")
-        text = ""
-        # Extract text from each page
+        full_text = ""
         for page in doc:
-            text += page.get_text("text")  # You can also use "dict" for structured text or "html"
-        return text.strip()
+            full_text += page.get_text()
+        return full_text.strip()
     except Exception as e:
-        logging.error(f"Failed to extract text: {e}")
+        logging.error(f"Error reading PDF: {e}")
         return ""
 
-# --- Build Retriever (cached per session) ---
-@st.cache_resource
-def build_retriever(full_text):
-    # Save text to temp file
-    with open("temp_text.txt", "w") as f:
-        f.write(full_text)
-
-    loader = TextLoader("temp_text.txt")
-    docs = loader.load()
-
-    # Chunking
-    text_splitter = RecursiveCharacterTextSplitter(chunk_size=800, chunk_overlap=300)
-    splits = text_splitter.split_documents(docs)
-
-    # Embeddings
-    embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
-
-    # Safe temporary directory for Chroma
-    chroma_dir = os.path.join(tempfile.gettempdir(), "chroma_db_rag")
-    if os.path.exists(chroma_dir):
-        shutil.rmtree(chroma_dir)
-    os.makedirs(chroma_dir, exist_ok=True)
+# --- Build Vectorstore ---
+def create_vectorstore(text_chunks, embeddings):
+    temp_dir = os.path.join(tempfile.gettempdir(), "chroma_db")
+    if os.path.exists(temp_dir):
+        shutil.rmtree(temp_dir)
+    os.makedirs(temp_dir, exist_ok=True)
 
-    db = Chroma.from_documents(splits, embeddings, persist_directory=chroma_dir)
+    # Wrap each chunk in a Document object
+    documents = [Document(page_content=chunk) for chunk in text_chunks]
+    db = Chroma.from_documents(documents, embedding=embeddings, persist_directory=temp_dir)
     db.persist()
-    return db.as_retriever(search_kwargs={"k": 6})
-
-# --- Process Answer ---
-def process_answer(question, full_text, retriever):
-    llm = load_model()
-
-    # Special handling for summary-type queries
-    if any(x in question.lower() for x in ["summarize", "summary", "tl;dr"]):
-        prompt = f"Summarize the following document:\n\n{full_text[:3000]}"
-        summary = llm(prompt)  # Uses the LLM to generate a summary
-        return summary
-
-    # --- Prompt Engineering ---
-    # Let's modify how we ask the model to answer
-    prompt = f"""
-    Given the following text, answer the question with a simple and direct 'Yes' or 'No' followed by a brief explanation.
-
-    Text: {full_text[:3000]}
-
-    Question: {question}
-    Answer:
-    """
-
-    # Use RetrievalQA for general queries
-    qa_chain = RetrievalQA.from_chain_type(llm=llm, retriever=retriever)
-    response = qa_chain.run(question)
-
-    return response
+    return db
+
+# --- Smart Chunking ---
+def chunk_text(full_text):
+    splitter = RecursiveCharacterTextSplitter(
+        chunk_size=1000,
+        chunk_overlap=150,
+        separators=["\n\n", "\n", ".", "!", "?", " ", ""]
+    )
+    return splitter.split_text(full_text)
+
+# --- Answering Logic ---
+def process_question(question, full_text):
+    if not full_text:
+        return "No valid text extracted from PDF."
+
+    text_chunks = chunk_text(full_text)
+    embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
+    vectorstore = create_vectorstore(text_chunks, embeddings)
+    retriever = vectorstore.as_retriever()
+
+    llm = load_llm()
+    qa = RetrievalQA.from_chain_type(
+        llm=llm,
+        retriever=retriever,
+        chain_type="stuff",
+        return_source_documents=False,
+        chain_type_kwargs={
+            "prompt": f"""You are a helpful assistant. Answer the user's question based only on the provided document content.
+
+If the answer is clearly stated in the document, respond accurately and directly.
+
+If not, say "The document does not provide enough information." Do not make things up.
+
+Question: {question}
+Context: {{context}}
+Answer:"""
+        }
+    )
+
+    return qa.run(question)
 
-# --- UI Layout ---
+# --- Streamlit UI ---
 with st.sidebar:
     st.header("📄 Upload PDF")
-    uploaded_file = st.file_uploader("Choose a PDF", type=["pdf"])
+    uploaded_file = st.file_uploader("Choose a PDF file", type=["pdf"])
 
-# --- Main Interface ---
 if uploaded_file:
-    st.success(f"You uploaded: {uploaded_file.name}")
-    full_text = read_pdf(uploaded_file)
+    st.success(f"Uploaded: {uploaded_file.name}")
+    full_text = extract_text_from_pdf(uploaded_file)
 
     if full_text:
         st.subheader("📝 PDF Preview")
-        with st.expander("View Extracted Text"):
+        with st.expander("📝 View Extracted Text"):
             st.write(full_text[:3000] + ("..." if len(full_text) > 3000 else ""))
 
-        st.subheader("💬 Ask a Question")
-        user_question = st.text_input("Type your question about the PDF content")
-
-        # Build retriever once per session
-        retriever = build_retriever(full_text)
+        st.subheader("💬 Ask your question")
+        user_question = st.text_input("Enter your question about the PDF")
 
         if user_question:
-            with st.spinner("Thinking..."):
-                answer = process_answer(user_question, full_text, retriever)
+            with st.spinner("🤖 Generating Answer..."):
+                answer = process_question(user_question, full_text)
             st.markdown("### 🤖 Answer")
             st.write(answer)
 
         with st.sidebar:
            st.markdown("---")
            st.markdown("**💡 Suggestions:**")
-            st.caption("Try: \"Summarize this document\" or \"What is the key idea?\"")
-            with st.expander("💡 Suggestions", expanded=True):
+            st.caption("Try: \"Summarize this document\" or \"What is the key idea?\"")
            st.markdown("""
            - "Summarize this document"
-            - "Give a quick summary"
-            - "What are the main points?"
-            - "Explain this document in short"
+            - "What is the background of Pradeep Singh Sengar?"
+            - "What experience does he have?"
+            - "List key skills mentioned in the document."
            """)
-
    else:
-        st.error("⚠️ No text could be extracted from the PDF. Try another file.")
+        st.error("❌ No extractable text found in this PDF. Try another file.")
 else:
    st.info("Upload a PDF to begin.")
+
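
Note on the new answering logic: `chain_type_kwargs={"prompt": f"""..."""}` hands RetrievalQA's "stuff" chain a plain f-string, but LangChain validates that slot as a `PromptTemplate` and fills in `{context}` and `{question}` itself at query time, so this raises a validation error, and interpolating the question at build time would bypass the chain's templating anyway. A minimal sketch of the fix, assuming `langchain.prompts.PromptTemplate` and the same variables used inside `process_question`:

# Sketch, not part of the commit: pass a real PromptTemplate so the
# chain can substitute {context} and {question} on every query.
from langchain.prompts import PromptTemplate

QA_PROMPT = PromptTemplate(
    input_variables=["context", "question"],
    template="""You are a helpful assistant. Answer the user's question based only on the provided document content.

If the answer is clearly stated in the document, respond accurately and directly.
If not, say "The document does not provide enough information." Do not make things up.

Question: {question}
Context: {context}
Answer:""",
)

qa = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    chain_type="stuff",
    return_source_documents=False,
    chain_type_kwargs={"prompt": QA_PROMPT},
)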
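
Note on performance: the old `build_retriever` was wrapped in `@st.cache_resource`, while the new `process_question` re-chunks, re-embeds, and rebuilds the Chroma store on every question. A possible follow-up that reuses the commit's own helpers (`get_retriever` is a hypothetical name introduced here for illustration):

# Sketch, not part of the commit: cache one retriever per document text.
# st.cache_resource keys the cache on the full_text argument, so repeated
# questions against the same PDF skip re-chunking and re-embedding.
@st.cache_resource
def get_retriever(full_text: str):
    chunks = chunk_text(full_text)
    embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
    return create_vectorstore(chunks, embeddings).as_retriever()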
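
Note on summaries: the old `process_answer` special-cased summary-type queries by sending `full_text[:3000]` straight to the LLM; after this commit every question, including the sidebar's "Summarize this document" suggestion, goes through the retriever, so the model only sees the handful of chunks the vector search returns (k defaults to 4 for a plain `as_retriever()`).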