pradeepsengarr committed · verified
Commit 0b64652 · 1 Parent(s): 528bb27

Update app.py

Files changed (1)
  1. app.py +81 -73
app.py CHANGED
@@ -1,26 +1,30 @@
 import os
 import streamlit as st
 import fitz  # PyMuPDF
-import tempfile
-import shutil
+import logging
 from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
 from langchain.text_splitter import RecursiveCharacterTextSplitter
-from langchain_community.embeddings import HuggingFaceEmbeddings
 from langchain_community.vectorstores import Chroma
-from langchain.chains import RetrievalQA
+from langchain_community.embeddings import SentenceTransformerEmbeddings
 from langchain_community.llms import HuggingFacePipeline
+from langchain.chains import RetrievalQA
 from langchain.prompts import PromptTemplate
+from langchain_community.document_loaders import TextLoader
 
-# --- Streamlit Setup ---
-st.set_page_config(page_title="📚 Accurate RAG PDF Chatbot", layout="wide")
-st.title("📚 Accurate RAG-based PDF Chatbot")
+# --- Configuration ---
+st.set_page_config(page_title="📚 RAG PDF Chatbot", layout="wide")
+st.title("📚 RAG-based PDF Chatbot")
+device = "cpu"
 
-# --- Load LLM (You can swap with Phi-2 or Mistral 7B later) ---
+# --- Logging ---
+logging.basicConfig(level=logging.INFO)
+
+# --- Load LLM ---
 @st.cache_resource
-def load_llm():
+def load_model():
     checkpoint = "MBZUAI/LaMini-T5-738M"
     tokenizer = AutoTokenizer.from_pretrained(checkpoint)
     model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)
-    pipe = pipeline("text2text-generation", model=model, tokenizer=tokenizer, max_length=512)
+    pipe = pipeline('text2text-generation', model=model, tokenizer=tokenizer, max_length=1024, do_sample=True, temperature=0.3, top_k=50, top_p=0.95)
     return HuggingFacePipeline(pipeline=pipe)
 
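Note on the generation settings: `load_model()` now samples (`do_sample=True`, `temperature=0.3`, `top_k=50`, `top_p=0.95`) and raises `max_length` from 512 to 1024. A minimal sketch for smoke-testing those settings outside Streamlit; the prompt string is illustrative, not from the app:

```python
# Standalone smoke test for the pipeline settings used above.
# The prompt text below is an arbitrary example.
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline

checkpoint = "MBZUAI/LaMini-T5-738M"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)
pipe = pipeline("text2text-generation", model=model, tokenizer=tokenizer,
                max_length=1024, do_sample=True, temperature=0.3,
                top_k=50, top_p=0.95)

# The pipeline returns a list of {"generated_text": ...} dicts.
print(pipe("Summarize: Retrieval-augmented generation grounds answers "
           "in retrieved context.")[0]["generated_text"])
```

At temperature 0.3 the output stays close to greedy decoding while still breaking ties randomly, which fits a question-answering app better than high-temperature sampling.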
@@ -27,17 +31,12 @@
-# --- Load Embeddings ---
-@st.cache_resource
-def load_embeddings():
-    return HuggingFaceEmbeddings(model_name="BAAI/bge-base-en-v1.5")
-
-# --- PDF Text Extraction ---
-def extract_text_from_pdf(uploaded_file):
+# --- Extract PDF Text ---
+def read_pdf(file):
     try:
-        doc = fitz.open(stream=uploaded_file.read(), filetype="pdf")
-        full_text = ""
+        doc = fitz.open(stream=file.read(), filetype="pdf")
+        text = ""
         for page in doc:
-            full_text += page.get_text()
-        return full_text.strip()
+            text += page.get_text()
+        return text.strip()
     except Exception as e:
-        st.error(f"❌ Error reading PDF: {e}")
+        logging.error(f"Failed to extract text: {e}")
         return ""
 
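For reference, the same PyMuPDF page loop as `read_pdf()`, but over a file on disk rather than a Streamlit upload stream; `read_pdf_from_path` and `example.pdf` are placeholder names, not part of the commit:

```python
import fitz  # PyMuPDF

def read_pdf_from_path(path: str) -> str:
    """Same page loop as read_pdf(), but opening a file by path."""
    text = ""
    with fitz.open(path) as doc:  # fitz documents support the context manager protocol
        for page in doc:
            text += page.get_text()
    return text.strip()

print(read_pdf_from_path("example.pdf")[:200])  # placeholder file name
```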
@@ -44,35 +43,20 @@
-# --- Text Chunking ---
-def chunk_text(full_text):
-    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=300)
-    return splitter.create_documents([full_text])
-
-# --- Vectorstore Setup (with in-memory temp directory) ---
-def build_vectorstore(chunks, embeddings):
-    temp_dir = os.path.join(tempfile.gettempdir(), "chromadb-rag")
-    if os.path.exists(temp_dir):
-        shutil.rmtree(temp_dir)
-    os.makedirs(temp_dir, exist_ok=True)
-    return Chroma.from_documents(documents=chunks, embedding=embeddings, persist_directory=temp_dir)
-
-# --- Prompt Template ---
-def get_prompt_template():
-    return PromptTemplate(
-        input_variables=["context", "question"],
-        template=(
-            "You are a helpful assistant. Answer the question based only on the following context.\n\n"
-            "Context:\n{context}\n\n"
-            "Question: {question}\n\n"
-            "Answer (Be accurate and concise):"
-        )
-    )
+# --- Process Answer ---
+def process_answer(question, full_text):
+    # Save the full_text to a temporary file
+    with open("temp_text.txt", "w") as f:
+        f.write(full_text)
+
+    loader = TextLoader("temp_text.txt")
+    docs = loader.load()
+
+    # Chunk the documents with increased size and overlap
+    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=300)
+    splits = text_splitter.split_documents(docs)
 
-# --- Answering Logic ---
-def get_answer(question, full_text):
-    if not question or not full_text:
-        return "⚠️ Please provide both PDF and a question."
+    # Load embeddings
+    embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
 
-    chunks = chunk_text(full_text)
-    embeddings = load_embeddings()
-    vectorstore = build_vectorstore(chunks, embeddings)
-    retriever = vectorstore.as_retriever()
+    # Create Chroma in-memory vector store
+    db = Chroma.from_documents(splits, embedding=embeddings)
+    retriever = db.as_retriever()
 
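`process_answer()` now round-trips the text through `TextLoader` and builds Chroma without a `persist_directory`, so the index lives in memory for the duration of the call. A minimal sketch of that retrieval path over plain strings; the sample texts and `k=1` are illustrative values:

```python
from langchain_community.embeddings import SentenceTransformerEmbeddings
from langchain_community.vectorstores import Chroma

texts = [
    "Chroma keeps document embeddings and serves similarity search.",
    "LaMini-T5-738M is a small instruction-tuned seq2seq model.",
]
embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
db = Chroma.from_texts(texts, embedding=embeddings)   # in-memory, like the app
retriever = db.as_retriever(search_kwargs={"k": 1})   # k=1 is an illustrative choice

print(retriever.get_relevant_documents("What does Chroma do?")[0].page_content)
```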
@@ -79,11 +63,24 @@
-    llm = load_llm()
+    # Set up the model
+    llm = load_model()
 
+    # Create a custom prompt
+    prompt_template = PromptTemplate.from_template("""
+    You are a helpful assistant. Use the following context to answer the question as accurately and thoroughly as possible.
+
+    Context: {context}
+
+    Question: {question}
+
+    Answer in detail:""")
+
+    # Retrieval QA with custom prompt
     qa_chain = RetrievalQA.from_chain_type(
         llm=llm,
         retriever=retriever,
         chain_type="stuff",
-        chain_type_kwargs={"prompt": get_prompt_template()}
+        chain_type_kwargs={"prompt": prompt_template}
     )
 
+    # Return the answer using the retrieval QA chain
     return qa_chain.run(question)
 
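With `chain_type="stuff"`, RetrievalQA concatenates the retrieved chunks into `{context}` and makes a single LLM call through the custom prompt. A small sketch of how that template renders, with made-up filler values:

```python
from langchain.prompts import PromptTemplate

prompt_template = PromptTemplate.from_template(
    "You are a helpful assistant. Use the following context to answer the question "
    "as accurately and thoroughly as possible.\n\n"
    "Context: {context}\n\nQuestion: {question}\n\nAnswer in detail:"
)

# .format() fills the same variables the RetrievalQA chain supplies at run time.
print(prompt_template.format(
    context="LaMini-T5-738M has 738 million parameters.",
    question="How large is the model?",
))
```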
@@ -90,29 +87,40 @@
-# --- UI ---
+# --- UI Layout ---
 with st.sidebar:
     st.header("📄 Upload PDF")
-    uploaded_pdf = st.file_uploader("Upload your PDF", type=["pdf"])
+    uploaded_file = st.file_uploader("Choose a PDF", type=["pdf"])
 
-if uploaded_pdf:
-    st.success(f"✅ Uploaded: {uploaded_pdf.name}")
-    full_text = extract_text_from_pdf(uploaded_pdf)
+# --- Main Interface ---
+if uploaded_file:
+    st.success(f"You uploaded: {uploaded_file.name}")
+    full_text = read_pdf(uploaded_file)
 
     if full_text:
-        with st.expander("📄 Preview PDF Text", expanded=False):
+        st.subheader("📝 PDF Preview")
+        with st.expander("View Extracted Text"):
             st.write(full_text[:3000] + ("..." if len(full_text) > 3000 else ""))
 
-        question = st.text_input("❓ Ask a question about this PDF")
+        st.subheader("💬 Ask a Question")
+        user_question = st.text_input("Type your question about the PDF content")
 
-        if question:
-            with st.spinner("💭 Generating answer..."):
-                answer = get_answer(question, full_text)
+        if user_question:
+            with st.spinner("Thinking..."):
+                answer = process_answer(user_question, full_text)
             st.markdown("### 🤖 Answer")
             st.write(answer)
+
+        with st.sidebar:
+            st.markdown("---")
+            st.markdown("**💡 Suggestions:**")
+            st.caption("Try: \"Summarize this document\" or \"What is the key idea?\"")
+            with st.expander("💡 Suggestions", expanded=True):
+                st.markdown("""
+                - "Summarize this document"
+                - "Give a quick summary"
+                - "What are the main points?"
+                - "Explain this document in short"
+                """)
+
     else:
-        st.error("⚠️ Could not extract any text from the PDF.")
+        st.error("⚠️ No text could be extracted from the PDF. Try another file.")
 else:
-    st.info("📥 Upload a PDF to start.")
-
-with st.sidebar:
-    st.markdown("---")
-    st.markdown("💡 Try questions like:")
-    st.caption("• What are the key ideas?\n• Summarize the document\n• What is Pradeep Singh Sengar's experience?")
+    st.info("Upload a PDF to begin.")
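One side effect worth noting: `temp_text.txt` is rewritten and the document re-chunked and re-embedded on every question. A possible follow-up, not part of this commit, would be to cache the retriever per document text with `st.cache_resource`; a sketch under that assumption (`build_retriever` is a hypothetical helper):

```python
import streamlit as st
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import SentenceTransformerEmbeddings
from langchain_community.vectorstores import Chroma

@st.cache_resource
def build_retriever(full_text: str):
    """Hypothetical helper: build the Chroma retriever once per unique document text."""
    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=300)
    chunks = splitter.create_documents([full_text])  # no temp file needed
    embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
    return Chroma.from_documents(chunks, embedding=embeddings).as_retriever()
```

`process_answer()` could then call `build_retriever(full_text)` and skip the temp-file round trip entirely.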