Create app.py
app.py
ADDED
@@ -0,0 +1,371 @@
# Full app setup in one script (modularized)

# Required Libraries
import streamlit as st
from PyPDF2 import PdfReader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.text_splitter import CharacterTextSplitter
from langchain.chains import RetrievalQA, LLMChain
from langchain.prompts import PromptTemplate
from langchain_google_genai import GoogleGenerativeAI
import os
import pandas as pd
import plotly.express as px
import uuid
import base64
import tempfile
import fitz  # PyMuPDF
from docx import Document
import google.generativeai as genai
from google.api_core.exceptions import InvalidArgument
from dotenv import load_dotenv

load_dotenv()
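# Note: load_dotenv() pulls variables from a local .env file into the
# environment, so GOOGLE_API_KEY can be supplied via .env (or a Space secret)
# instead of being typed into the UI on every run.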
st.set_page_config(layout="wide")
st.title("📄 PDF QA App")

# Initialize session state for uploaded files
if "uploaded_files" not in st.session_state:
    st.session_state.uploaded_files = []

# Initialize Gemini model
@st.cache_resource
def load_gemini_model():
    # You'll need to get an API key from Google AI Studio
    api_key = os.getenv("GOOGLE_API_KEY")

    if not api_key:
        api_key = st.text_input("Enter your Google API Key", type="password")
        if not api_key:
            st.warning("Please enter a Google API key to continue")
            st.stop()

    # Configure the Gemini model
    try:
        # Configure the genai module
        genai.configure(api_key=api_key)

        # Verify available models
        models = genai.list_models()
        available_models = [m.name for m in models]

        # Check which model is available and select the appropriate one
        gemini_model_name = None
        for model_option in ["gemini-1.5-pro", "gemini-pro", "gemini-1.0-pro"]:
            if any(model_option in model for model in available_models):
                gemini_model_name = model_option
                break

        if not gemini_model_name:
            st.error(f"No Gemini model found. Available models: {available_models}")
            st.stop()

        st.success(f"Using Gemini model: {gemini_model_name}")

        # Initialize the LangChain wrapper for Gemini
        llm = GoogleGenerativeAI(
            model=gemini_model_name,
            google_api_key=api_key,
            temperature=0.3,
            max_output_tokens=512
        )
        return llm
    except Exception as e:
        st.error(f"Error initializing Gemini model: {str(e)}")
        st.stop()
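# Caveat (assumption about Streamlit's caching rules): st.cache_resource is
# intended for widget-free functions, so the st.text_input fallback above may
# raise a cached-widget error on some Streamlit versions. If it does, one safe
# layout is to collect the key before calling the cached function, e.g. with a
# hypothetical signature that takes the key as an argument:
#   api_key = os.getenv("GOOGLE_API_KEY") or st.text_input(
#       "Enter your Google API Key", type="password")
#   llm = load_gemini_model(api_key)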
# Session state for chat history
if "chat_history" not in st.session_state:
    st.session_state.chat_history = []
if "analytics" not in st.session_state:
    st.session_state.analytics = []

# File uploader
pdf_files = st.file_uploader("Upload one or more PDFs", type="pdf", accept_multiple_files=True)

# Store uploaded files in session state for later use
if pdf_files:
    st.session_state.uploaded_files = pdf_files
# Interactive PDF Viewer
with st.expander("📄 PDF Viewer", expanded=False):
    try:
        if st.session_state.uploaded_files:
            # Display the uploaded files in a selection box
            pdf_file_names = [uploaded_file.name for uploaded_file in st.session_state.uploaded_files]
            pdf_file_names.insert(0, "Select PDF File")
            selected_pdf = st.selectbox("Select a PDF to view", pdf_file_names)

            # Retrieve the selected PDF file
            selected_file = None
            for uploaded_file in st.session_state.uploaded_files:
                if uploaded_file.name == selected_pdf:
                    selected_file = uploaded_file
                    break

            # Display the selected PDF
            if selected_file and selected_pdf != "Select PDF File":
                st.subheader(f"Viewing PDF: {selected_pdf}")

                # Read PDF file
                selected_file.seek(0)  # Reset file pointer to start
                pdf_bytes = selected_file.read()
                selected_file.seek(0)  # Reset file pointer after reading

                # Encode the PDF file in base64 for displaying in an iframe
                pdf_base64 = base64.b64encode(pdf_bytes).decode('utf-8')

                # Display the PDF file in an iframe using an HTML embed
                pdf_display = f'<iframe src="data:application/pdf;base64,{pdf_base64}" width="100%" height="600" type="application/pdf"></iframe>'
                st.markdown(pdf_display, unsafe_allow_html=True)
    except Exception as e:
        st.error(f"Error displaying PDF: {str(e)}")
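# Note: embedding the whole PDF as a base64 data: URI is simple but keeps the
# entire file inline in the page, so large uploads bloat the HTML, and some
# browsers restrict PDFs rendered inside data: iframes.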
question = st.text_input("Ask a question across PDFs")
# Helper: read an uploaded PDF and split its text into chunks
def load_and_chunk(file):
    # Save the current file pointer position
    file_pos = file.tell()

    # Reset file pointer to start
    file.seek(0)

    try:
        reader = PdfReader(file)
        all_text, page_map = "", {}
        for i, page in enumerate(reader.pages):
            text = page.extract_text()
            page_map[i] = text
            all_text += f"\n[Page {i + 1}]\n{text}"

        splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
        chunks = splitter.split_text(all_text)

        # Reset file pointer to its original position
        file.seek(file_pos)

        return chunks, page_map
    except Exception as e:
        st.error(f"Error processing PDF {file.name}: {str(e)}")
        file.seek(file_pos)  # Reset file pointer even if there's an error
        return [], {}
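# Note: CharacterTextSplitter splits on a separator ("\n\n" by default) and
# counts chunk_size/chunk_overlap in characters, so each ~1000-character chunk
# repeats roughly 200 characters of the previous one. The "[Page N]" markers
# injected above travel inside the chunks; the QA section below parses them
# back out to attribute an answer to a page.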
# Helper: Create FAISS store
def embed_documents(chunks):
    # Use HuggingFace embeddings instead of OpenAI
    try:
        embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")
        return FAISS.from_texts(chunks, embeddings)
    except Exception as e:
        st.error(f"Error creating embeddings: {str(e)}")
        return None
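# Note: sentence-transformers/all-mpnet-base-v2 runs locally and downloads its
# weights (a few hundred MB) on first use, so the first upload after a cold
# start is slow; subsequent calls reuse the cached model.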
# Helper: Display PDF Page (both methods available)
def show_pdf_page(file, page_num, use_iframe=False):
    # Save current position
    file_pos = file.tell()

    # Reset file pointer
    file.seek(0)

    try:
        if use_iframe:
            # Read the entire PDF
            pdf_bytes = file.read()
            # Encode the PDF file in base64 for displaying in an iframe
            pdf_base64 = base64.b64encode(pdf_bytes).decode('utf-8')
            # Display the PDF in an iframe, jumping to the requested page
            pdf_display = f'<iframe src="data:application/pdf;base64,{pdf_base64}#page={page_num}" width="100%" height="500" type="application/pdf"></iframe>'
            st.markdown(pdf_display, unsafe_allow_html=True)
        else:
            # Original method using PyMuPDF to render as image
            with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
                tmp.write(file.read())
                tmp_path = tmp.name

            # Open the saved PDF
            doc = fitz.open(tmp_path)

            # Validate page number
            if page_num < 1 or page_num > len(doc):
                st.error(f"Invalid page number: {page_num}. Document has {len(doc)} pages.")
                doc.close()  # Close the document before bailing out
                return

            page = doc.load_page(page_num - 1)
            pix = page.get_pixmap()
            img_path = tmp_path.replace(".pdf", f"_page{page_num}.png")
            pix.save(img_path)
            st.image(img_path, caption=f"Page {page_num}")

            # Clean up
            doc.close()
            try:
                os.unlink(img_path)
                os.unlink(tmp_path)
            except Exception:
                pass  # Silently ignore cleanup errors
    except Exception as e:
        st.error(f"Error displaying PDF page: {str(e)}")
    finally:
        # Reset file pointer to original position
        file.seek(file_pos)
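# Note: show_pdf_page is defined but never called in the current flow. It is
# kept so the QA results below could be wired to jump straight to a cited
# page, e.g. show_pdf_page(selected_file, int(page_num)) once a numeric page
# has been parsed out of the answer context.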
# Helper: Summarize
@st.cache_data
def summarize_doc(chunks, _llm):
    summary_prompt = PromptTemplate(
        input_variables=["context"],
        template="Summarize this document:\n{context}"
    )
    chain = LLMChain(llm=_llm, prompt=summary_prompt)

    # Join only a subset of chunks to avoid token limits
    full_text = " ".join(chunks[:5])  # Limiting to first 5 chunks

    try:
        return chain.run({"context": full_text})
    except Exception as e:
        st.error(f"Error during summarization: {str(e)}")
        return "Error: Document too large to summarize or API error. Try with fewer pages."
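# Note: the leading underscore in `_llm` tells st.cache_data not to hash that
# argument. Joining only chunks[:5] keeps the prompt small but silently drops
# the rest of the document. A sketch of a whole-document alternative using
# LangChain's map-reduce summarizer (Document aliased to avoid clashing with
# the python-docx import above):
#   from langchain.chains.summarize import load_summarize_chain
#   from langchain.docstore.document import Document as LCDocument
#   chain = load_summarize_chain(_llm, chain_type="map_reduce")
#   summary = chain.run([LCDocument(page_content=c) for c in chunks])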
# Initialize model and DBs
# (dicts are created before the try so later sections can check them safely)
file_chunks, vector_dbs, page_maps = {}, {}, {}
try:
    llm = load_gemini_model()

    if pdf_files:
        with st.spinner("Processing PDF files..."):
            for file in pdf_files:
                chunks, page_map = load_and_chunk(file)
                if chunks:  # Only create a db if chunks were successfully extracted
                    db = embed_documents(chunks)
                    if db:  # Only store if the db was successfully created
                        file_chunks[file.name] = chunks
                        page_maps[file.name] = page_map
                        vector_dbs[file.name] = db
except Exception as e:
    st.error(f"Error loading model or processing files: {str(e)}")
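# Design note: one FAISS index per file keeps every answer attributable to a
# specific PDF. Merging all chunks into a single index would answer with one
# query instead of one per file, at the cost of the per-file grouping the
# sections below rely on.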
# Document Summarization UI
if pdf_files and file_chunks:
    with st.expander("📝 Document Summarization"):
        summarize_option = st.selectbox("Select a document to summarize",
                                        ["All"] + [f.name for f in pdf_files if f.name in file_chunks])
        if st.button("Summarize"):
            with st.spinner("Summarizing..."):
                try:
                    if summarize_option == "All":
                        for file in pdf_files:
                            if file.name in file_chunks:
                                summary = summarize_doc(file_chunks[file.name], llm)
                                st.subheader(file.name)
                                st.write(summary)
                    else:
                        f = next(f for f in pdf_files if f.name == summarize_option)
                        summary = summarize_doc(file_chunks[f.name], llm)
                        st.subheader(f.name)
                        st.write(summary)
                except Exception as e:
                    st.error(f"Error during summarization: {str(e)}")
# Question Answering UI
results = []
if question and vector_dbs:
    try:
        for fname, db in vector_dbs.items():
            qa = RetrievalQA.from_chain_type(llm=llm, retriever=db.as_retriever())

            try:
                result = qa({"query": question})
                answer = result['result']

                context_docs = db.similarity_search(question, k=1)
                if context_docs:
                    context = context_docs[0].page_content

                    # Extract page number safely from the "[Page N]" marker
                    page_num = "Unknown"
                    try:
                        page_num_match = context.split("[Page ")
                        if len(page_num_match) > 1:
                            page_num = page_num_match[1].split("]")[0]
                    except Exception:
                        pass

                    st.markdown(f"### 📄 {fname} (Page {page_num})")
                    # st.write(highlight_text(context, answer))
                    st.write(answer)

                    st.session_state.chat_history.append({
                        "file": fname,
                        "page": page_num,
                        "question": question,
                        "answer": answer
                    })

                    st.session_state.analytics.append({
                        "file": fname,
                        "page": int(page_num) if page_num.isdigit() else 0,
                        "confidence": 0.9,  # Placeholder; the chain returns no score
                        "question": question
                    })

                    results.append((fname, page_num, question, answer))
            except Exception as e:
                st.error(f"Error processing question for {fname}: {str(e)}")
    except Exception as e:
        st.error(f"Error during question answering: {str(e)}")
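# Note: RetrievalQA.from_chain_type defaults to the "stuff" chain, which packs
# the retrieved chunks directly into the prompt. The extra
# similarity_search(question, k=1) call above only recovers a "[Page N]"
# marker for attribution; that top-1 chunk may differ from what the retriever
# actually fed the model, so the page label is a best guess.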
# Chat History Panel
if st.session_state.chat_history:
    with st.expander("💬 Chat History"):
        for entry in st.session_state.chat_history[::-1]:
            st.markdown(f"**{entry['file']}** | Page {entry['page']}\n> {entry['question']}\n→ {entry['answer']}")
# Downloadable Report
if results:
    with st.expander("📥 Download Q&A Report"):
        docx = Document()
        docx.add_heading("PDF QA Report", 0)
        for fname, page, q, a in results:
            docx.add_paragraph(f"File: {fname} | Page: {page}", style="List Bullet")
            docx.add_paragraph(f"Q: {q}")
            docx.add_paragraph(f"A: {a}\n")

        try:
            docx_path = os.path.join(tempfile.gettempdir(), f"report_{uuid.uuid4()}.docx")
            docx.save(docx_path)
            with open(docx_path, "rb") as f:
                b64 = base64.b64encode(f.read()).decode()
            st.markdown(f"[Download DOCX Report](data:application/octet-stream;base64,{b64})", unsafe_allow_html=True)
            # Clean up temporary files
            try:
                os.unlink(docx_path)
            except Exception:
                pass
        except Exception as e:
            st.error(f"Error creating downloadable report: {str(e)}")
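# Note: st.download_button is Streamlit's built-in alternative to the base64
# markdown link and avoids browsers that cap data: URI length. A sketch,
# placed before the os.unlink cleanup:
#   with open(docx_path, "rb") as f:
#       st.download_button("Download DOCX Report", f.read(),
#                          file_name="pdf_qa_report.docx")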
# Analytics Dashboard
if st.session_state.analytics:
    with st.expander("📊 Analytics Dashboard"):
        df = pd.DataFrame(st.session_state.analytics)
        col1, col2 = st.columns(2)
        with col1:
            st.dataframe(df)
        with col2:
            try:
                fig = px.histogram(df, x="file", color="page", title="Answer Distribution by File")
                st.plotly_chart(fig, use_container_width=True)
            except Exception as e:
                st.error(f"Error generating analytics chart: {str(e)}")

        st.markdown("Use filters below to explore:")
        file_filter = st.selectbox("Filter by file", ["All"] + list(df["file"].unique()))
        if file_filter != "All":
            st.dataframe(df[df["file"] == file_filter])