pradeepsengarr committed
Commit 48058d3 · verified · 1 Parent(s): 1eda1ed

Update app.py

Files changed (1)
  1. app.py +6 -99
app.py CHANGED
@@ -422,21 +422,19 @@ import os
 import streamlit as st
 import fitz  # PyMuPDF
 import logging
-import math
+import tempfile
+import shutil
 from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain_community.vectorstores import Chroma
 from langchain_community.embeddings import SentenceTransformerEmbeddings
 from langchain_community.llms import HuggingFacePipeline
 from langchain.chains import RetrievalQA
-from langchain.schema import Document
-from sentence_transformers import SentenceTransformer
-from langchain_community.embeddings import HuggingFaceEmbeddings
+from langchain_community.document_loaders import TextLoader
 
 # --- Configuration ---
 st.set_page_config(page_title="📚 RAG PDF Chatbot", layout="wide")
 st.title("📚 RAG-based PDF Chatbot")
-persist_directory = "db"
 device = "cpu"
 
 # --- Logging ---
@@ -463,94 +461,8 @@ def read_pdf(file):
         logging.error(f"Failed to extract text: {e}")
         return ""
 
-# --- Split Text into Chunks ---
-def split_text_into_chunks(text):
-    splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
-    return splitter.create_documents([text])
-
-import os
-import shutil
-from sentence_transformers import SentenceTransformer
-from langchain.embeddings import HuggingFaceEmbeddings
-from langchain.vectorstores import Chroma
-
-# Setup a writable directory for Chroma
-chroma_dir = "/home/user/app/chroma_db"  # Change this to an absolute writable directory
-if os.path.exists(chroma_dir):
-    shutil.rmtree(chroma_dir)  # Clear any old data
-os.makedirs(chroma_dir, exist_ok=True)
-
-# Initialize the model and embeddings
-model = SentenceTransformer("all-MiniLM-L6-v2", device='cpu')
-embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
-
-# Create the Chroma database
-try:
-    db = Chroma.from_documents(splits, embeddings, persist_directory=chroma_dir)
-    db.persist()
-    print(f"Vectorstore created successfully at {chroma_dir}")
-except Exception as e:
-    print(f"Error creating vectorstore: {e}")
-
-
-# --- Setup QA Chain ---
-def setup_qa(db):
-    retriever = db.as_retriever()
-    llm = load_model()
-    return RetrievalQA.from_chain_type(llm=llm, retriever=retriever, return_source_documents=True)
-
 # --- Process Answer ---
-# def process_answer(question, full_text):
-#     # STEP 1: Chunk the PDF text
-#     text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
-#     docs = text_splitter.create_documents([full_text])
-
-#     # STEP 2: Create embeddings
-#     embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
-#     db = Chroma.from_documents(docs, embeddings)
-
-#     # STEP 3: Retrieve relevant chunks using the question
-#     retriever = db.as_retriever(search_type="similarity", search_kwargs={"k": 5})
-#     relevant_docs = retriever.get_relevant_documents(question)
-
-#     # STEP 4: Format the context
-#     context = "\n\n".join([doc.page_content for doc in relevant_docs])
-
-#     # STEP 5: Prompting
-#     prompt_template = """
-#     You are a helpful assistant that answers questions based on the context below.
-
-#     Context:
-#     {context}
-
-#     Question: {question}
-
-#     Answer:
-#     """.strip()
-
-#     prompt = prompt_template.format(context=context, question=question)
-
-#     # STEP 6: Load the model and generate response
-#     llm = HuggingFacePipeline.from_model_id(
-#         model_id="MBZUAI/LaMini-T5-738M",
-#         task="text2text-generation",
-#         model_kwargs={"temperature": 0.3, "max_length": 256},
-#     )
-
-#     return llm.invoke(prompt)
-
-import tempfile
-import os
-
 def process_answer(question, full_text):
-    from langchain_community.document_loaders import TextLoader
-    from langchain.text_splitter import RecursiveCharacterTextSplitter
-    from langchain.vectorstores import Chroma
-    from langchain_community.embeddings import SentenceTransformerEmbeddings
-    from langchain.chains import RetrievalQA
-    from langchain import HuggingFacePipeline
-    from transformers import pipeline
-
     # Save the full_text to a temporary file
     with open("temp_text.txt", "w") as f:
         f.write(full_text)
@@ -568,15 +480,13 @@ def process_answer(question, full_text):
     # Create a temporary directory for ChromaDB
     chroma_dir = os.path.join(tempfile.gettempdir(), "chroma_db")
     if os.path.exists(chroma_dir):
-        import shutil
         shutil.rmtree(chroma_dir)
 
     db = Chroma.from_documents(splits, embeddings, persist_directory=chroma_dir)
     retriever = db.as_retriever()
 
     # Set up the model
-    pipe = pipeline("text2text-generation", model="MBZUAI/LaMini-T5-738M", max_length=512)
-    llm = HuggingFacePipeline(pipeline=pipe)
+    llm = load_model()
 
     # RAG-style retrieval QA
     qa_chain = RetrievalQA.from_chain_type(llm=llm, retriever=retriever)
@@ -589,12 +499,10 @@ def process_answer(question, full_text):
     else:
         return qa_chain.run(question)
 
-
 # --- UI Layout ---
 with st.sidebar:
     st.header("📄 Upload PDF")
     uploaded_file = st.file_uploader("Choose a PDF", type=["pdf"])
-
 
 # --- Main Interface ---
 if uploaded_file:
@@ -602,7 +510,7 @@ if uploaded_file:
     full_text = read_pdf(uploaded_file)
 
     if full_text:
-        st.subheader("📑 PDF Preview")
+        st.subheader("📝 PDF Preview")
         with st.expander("View Extracted Text"):
             st.write(full_text[:3000] + ("..." if len(full_text) > 3000 else ""))
 
@@ -618,7 +526,7 @@ if uploaded_file:
        with st.sidebar:
            st.markdown("---")
            st.markdown("**💡 Suggestions:**")
-           st.caption("Try: \"Summarize this document\" or \"What is the key idea?\"")
+           st.caption("Try: \"Summarize this document\" or \"What is the key idea?\"")
        with st.expander("💡 Suggestions", expanded=True):
            st.markdown("""
            - "Summarize this document"
@@ -627,7 +535,6 @@ if uploaded_file:
            - "Explain this document in short"
            """)
 
-
    else:
        st.error("⚠️ No text could be extracted from the PDF. Try another file.")
 else:
 
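The refactored process_answer delegates model construction to load_model(), which is defined earlier in app.py, outside the hunks shown here. Based on the inline pipeline this commit removes, a minimal sketch of what that helper plausibly looks like (the @st.cache_resource decorator and the exact signature are assumptions, not part of the diff):

# Hypothetical reconstruction of load_model(); the real definition lives
# outside this diff. It wraps the same LaMini-T5 checkpoint that the
# removed inline pipeline() call used.
@st.cache_resource  # assumption: cache so Streamlit reruns don't reload weights
def load_model():
    tokenizer = AutoTokenizer.from_pretrained("MBZUAI/LaMini-T5-738M")
    model = AutoModelForSeq2SeqLM.from_pretrained("MBZUAI/LaMini-T5-738M")
    pipe = pipeline(
        "text2text-generation",
        model=model,
        tokenizer=tokenizer,
        max_length=512,  # mirrors the removed pipeline call
        device=-1,       # CPU, matching device = "cpu" above
    )
    return HuggingFacePipeline(pipeline=pipe)

Note that splits and embeddings, used inside process_answer, are presumably built in the lines elided between the hunks (old lines 557-567); this sketch covers only the model side of the change.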