pradeepsengarr committed on
Commit
528bb27
·
verified ·
1 Parent(s): cb0ff81

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +77 -86
app.py CHANGED
@@ -1,127 +1,118 @@
1
  import os
2
- import shutil
3
- import tempfile
4
- import fitz # PyMuPDF
5
  import streamlit as st
6
- import logging
7
-
 
8
  from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
9
  from langchain.text_splitter import RecursiveCharacterTextSplitter
 
10
  from langchain_community.vectorstores import Chroma
11
- from langchain_community.embeddings import SentenceTransformerEmbeddings
12
  from langchain.chains import RetrievalQA
13
  from langchain_community.llms import HuggingFacePipeline
14
  from langchain.prompts import PromptTemplate
15
- from langchain_community.document_loaders import TextLoader
16
-
17
- # --- Streamlit Config ---
18
- st.set_page_config(page_title="📚 RAG PDF Chatbot", layout="wide")
19
- st.title("📚 RAG-based PDF Chatbot")
20
 
21
- # --- Logging ---
22
- logging.basicConfig(level=logging.INFO)
 
23
 
24
- # --- Load Model ---
25
  @st.cache_resource
26
- def load_model():
27
  checkpoint = "MBZUAI/LaMini-T5-738M"
28
  tokenizer = AutoTokenizer.from_pretrained(checkpoint)
29
  model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)
30
  pipe = pipeline("text2text-generation", model=model, tokenizer=tokenizer, max_length=512)
31
  return HuggingFacePipeline(pipeline=pipe)
32
 
33
- # --- Extract PDF Text ---
34
- def extract_text_from_pdf(file):
 
 
 
 
 
35
  try:
36
- doc = fitz.open(stream=file.read(), filetype="pdf")
37
- return "\n".join([page.get_text() for page in doc])
 
 
 
38
  except Exception as e:
39
- logging.error(f"Error reading PDF: {e}")
40
  return ""
41
 
42
- # --- Create Chroma Vectorstore Safely ---
43
- def create_vectorstore(documents, embeddings):
44
- temp_dir = tempfile.mkdtemp() # unique, writable temp dir
45
- db = Chroma.from_documents(documents, embedding=embeddings, persist_directory=temp_dir)
46
- return db
47
-
48
- # --- Build RAG QA Chain ---
49
- def build_qa_chain(retriever, llm):
50
- prompt_template = PromptTemplate(
 
 
 
 
 
 
 
51
  input_variables=["context", "question"],
52
- template="""
53
- You are a helpful assistant. Use the context below to answer the user's question as accurately and truthfully as possible.
54
-
55
- Context:
56
- {context}
57
-
58
- Question:
59
- {question}
60
-
61
- Helpful Answer:
62
- """
63
  )
64
- return RetrievalQA.from_chain_type(llm=llm, retriever=retriever, chain_type_kwargs={"prompt": prompt_template})
65
 
66
- # --- Process QA ---
67
- def process_question(question, full_text):
68
- # Write PDF text to temp file
69
- with open("temp_text.txt", "w") as f:
70
- f.write(full_text)
71
 
72
- loader = TextLoader("temp_text.txt")
73
- docs = loader.load()
 
 
74
 
75
- text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=150)
76
- chunks = text_splitter.split_documents(docs)
77
 
78
- embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
79
- vectorstore = create_vectorstore(chunks, embeddings)
80
- retriever = vectorstore.as_retriever()
 
 
 
81
 
82
- llm = load_model()
83
- qa = build_qa_chain(retriever, llm)
84
- return qa.run(question)
85
 
86
- # --- Sidebar Upload ---
87
  with st.sidebar:
88
- st.header("📄 Upload your PDF")
89
- uploaded_file = st.file_uploader("Upload PDF file", type=["pdf"])
90
 
91
- # --- Main Logic ---
92
- if uploaded_file:
93
- st.success(f"Uploaded: {uploaded_file.name}")
94
- full_text = extract_text_from_pdf(uploaded_file)
95
 
96
  if full_text:
97
- with st.expander("📄 View Extracted PDF Text", expanded=False):
98
  st.write(full_text[:3000] + ("..." if len(full_text) > 3000 else ""))
99
 
100
- st.subheader("💬 Ask Something")
101
- user_question = st.text_input("Ask a question about the document")
102
 
103
- if user_question:
104
- with st.spinner("Analyzing..."):
105
- try:
106
- answer = process_question(user_question, full_text)
107
- except Exception as e:
108
- st.error("⚠️ Something went wrong. Try re-uploading the PDF.")
109
- st.stop()
110
  st.markdown("### 🤖 Answer")
111
  st.write(answer)
112
-
113
- with st.sidebar:
114
- st.markdown("---")
115
- st.caption("💡 Sample Questions")
116
- st.markdown("""
117
- - "Summarize the document"
118
- - "What is the experience of Pradeep Singh Sengar?"
119
- - "What are the key points?"
120
- - "Explain in short"
121
- """)
122
  else:
123
- st.error("❌ Could not extract text. Try a different PDF.")
124
  else:
125
- st.info("Upload a PDF to get started.")
126
-
127
 
 
 
 
 
 
1
  import os
 
 
 
2
  import streamlit as st
3
+ import fitz # PyMuPDF
4
+ import tempfile
5
+ import shutil
6
  from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
7
  from langchain.text_splitter import RecursiveCharacterTextSplitter
8
+ from langchain_community.embeddings import HuggingFaceEmbeddings
9
  from langchain_community.vectorstores import Chroma
 
10
  from langchain.chains import RetrievalQA
11
  from langchain_community.llms import HuggingFacePipeline
12
  from langchain.prompts import PromptTemplate
 
 
 
 
 
13
 
14
+ # --- Streamlit Setup ---
15
+ st.set_page_config(page_title="📚 Accurate RAG PDF Chatbot", layout="wide")
16
+ st.title("📚 Accurate RAG-based PDF Chatbot")
17
 
18
+ # --- Load LLM (You can swap with Phi-2 or Mistral 7B later) ---
19
  @st.cache_resource
20
+ def load_llm():
21
  checkpoint = "MBZUAI/LaMini-T5-738M"
22
  tokenizer = AutoTokenizer.from_pretrained(checkpoint)
23
  model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)
24
  pipe = pipeline("text2text-generation", model=model, tokenizer=tokenizer, max_length=512)
25
  return HuggingFacePipeline(pipeline=pipe)
26
 
27
+ # --- Load Embeddings ---
28
+ @st.cache_resource
29
+ def load_embeddings():
30
+ return HuggingFaceEmbeddings(model_name="BAAI/bge-base-en-v1.5")
31
+
32
+ # --- PDF Text Extraction ---
33
+ def extract_text_from_pdf(uploaded_file):
34
  try:
35
+ doc = fitz.open(stream=uploaded_file.read(), filetype="pdf")
36
+ full_text = ""
37
+ for page in doc:
38
+ full_text += page.get_text()
39
+ return full_text.strip()
40
  except Exception as e:
41
+ st.error(f"❌ Error reading PDF: {e}")
42
  return ""
43
 
44
+ # --- Text Chunking ---
45
+ def chunk_text(full_text):
46
+ splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=300)
47
+ return splitter.create_documents([full_text])
48
+
49
+ # --- Vectorstore Setup (persisted in an on-disk temp directory) ---
50
+ def build_vectorstore(chunks, embeddings):
51
+ temp_dir = os.path.join(tempfile.gettempdir(), "chromadb-rag")
52
+ if os.path.exists(temp_dir):
53
+ shutil.rmtree(temp_dir)
54
+ os.makedirs(temp_dir, exist_ok=True)
55
+ return Chroma.from_documents(documents=chunks, embedding=embeddings, persist_directory=temp_dir)
56
+
57
+ # --- Prompt Template ---
58
+ def get_prompt_template():
59
+ return PromptTemplate(
60
  input_variables=["context", "question"],
61
+ template=(
62
+ "You are a helpful assistant. Answer the question based only on the following context.\n\n"
63
+ "Context:\n{context}\n\n"
64
+ "Question: {question}\n\n"
65
+ "Answer (Be accurate and concise):"
66
+ )
 
 
 
 
 
67
  )
 
68
 
69
+ # --- Answering Logic ---
70
+ def get_answer(question, full_text):
71
+ if not question or not full_text:
72
+ return "⚠️ Please provide both PDF and a question."
 
73
 
74
+ chunks = chunk_text(full_text)
75
+ embeddings = load_embeddings()
76
+ vectorstore = build_vectorstore(chunks, embeddings)
77
+ retriever = vectorstore.as_retriever()
78
 
79
+ llm = load_llm()
 
80
 
81
+ qa_chain = RetrievalQA.from_chain_type(
82
+ llm=llm,
83
+ retriever=retriever,
84
+ chain_type="stuff",
85
+ chain_type_kwargs={"prompt": get_prompt_template()}
86
+ )
87
 
88
+ return qa_chain.run(question)
 
 
89
 
90
+ # --- UI ---
91
  with st.sidebar:
92
+ st.header("📄 Upload PDF")
93
+ uploaded_pdf = st.file_uploader("Upload your PDF", type=["pdf"])
94
 
95
+ if uploaded_pdf:
96
+ st.success(f"✅ Uploaded: {uploaded_pdf.name}")
97
+ full_text = extract_text_from_pdf(uploaded_pdf)
 
98
 
99
  if full_text:
100
+ with st.expander("📄 Preview PDF Text", expanded=False):
101
  st.write(full_text[:3000] + ("..." if len(full_text) > 3000 else ""))
102
 
103
+ question = st.text_input("❓ Ask a question about this PDF")
 
104
 
105
+ if question:
106
+ with st.spinner("💭 Generating answer..."):
107
+ answer = get_answer(question, full_text)
 
 
 
 
108
  st.markdown("### 🤖 Answer")
109
  st.write(answer)
 
 
 
 
 
 
 
 
 
 
110
  else:
111
+ st.error("⚠️ Could not extract any text from the PDF.")
112
  else:
113
+ st.info("📥 Upload a PDF to start.")
 
114
 
115
+ with st.sidebar:
116
+ st.markdown("---")
117
+ st.markdown("💡 Try questions like:")
118
+ st.caption("• What are the key ideas?\n• Summarize the document\n• What is Pradeep Singh Sengar's experience?")