lalitwale100 committed on
Commit
3d9654b
·
verified ·
1 Parent(s): 602e6b6

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +371 -0
app.py ADDED
@@ -0,0 +1,371 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Full app setup in one script (modularized)
2
+ # Required Libraries
3
import base64
import io
import os
import re
import tempfile
import uuid

import fitz  # PyMuPDF
import google.generativeai as genai
import pandas as pd
import plotly.express as px
import streamlit as st
from docx import Document
from dotenv import load_dotenv
from google.api_core.exceptions import InvalidArgument
from langchain.chains import LLMChain, RetrievalQA
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.prompts import PromptTemplate
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain_google_genai import GoogleGenerativeAI
from PyPDF2 import PdfReader
22
+
23
# Load variables from a local .env file into the process environment.
load_dotenv()

# Page configuration is issued before any other Streamlit command.
st.set_page_config(layout="wide")
st.title("📚 PDF QA App")

# Initialize session state for uploaded files
if "uploaded_files" not in st.session_state:
    st.session_state.uploaded_files = []
31
+
32
# Initialize Gemini model
@st.cache_resource
def _build_gemini_llm(api_key):
    """Build the LangChain Gemini wrapper for ``api_key``.

    Cached by ``st.cache_resource`` and keyed on the API key. Failures are
    raised rather than returned so that only a successfully built model is
    stored in the cache.

    Returns:
        A ``(llm, model_name)`` tuple.

    Raises:
        LookupError: if none of the preferred Gemini models is listed for
            this key.
    """
    # Configure the genai module
    genai.configure(api_key=api_key)

    # Verify available models
    available_models = [m.name for m in genai.list_models()]

    # Pick the first preferred model whose name appears in an available
    # model name (the same substring match the app has always used).
    preferred = ("gemini-1.5-pro", "gemini-pro", "gemini-1.0-pro")
    gemini_model_name = next(
        (
            option
            for option in preferred
            if any(option in name for name in available_models)
        ),
        None,
    )
    if not gemini_model_name:
        raise LookupError(
            f"No Gemini model found. Available models: {available_models}"
        )

    # Initialize the LangChain wrapper for Gemini
    llm = GoogleGenerativeAI(
        model=gemini_model_name,
        google_api_key=api_key,
        temperature=0.3,
        max_output_tokens=512,
    )
    return llm, gemini_model_name


def load_gemini_model():
    """Return a ready-to-use Gemini LLM, asking for an API key if needed.

    The key is read from the ``GOOGLE_API_KEY`` environment variable; when
    it is missing the user is prompted for it. The prompt widget lives in
    this uncached function because Streamlit does not support widgets
    inside ``st.cache_resource`` functions; only the model construction is
    cached.

    Returns:
        The ``GoogleGenerativeAI`` instance. On any failure an error is
        shown and the script run is halted with ``st.stop()``.
    """
    # You'll need to get an API key from Google AI Studio
    api_key = os.getenv("GOOGLE_API_KEY")

    if not api_key:
        api_key = st.text_input("Enter your Google API Key", type="password")
        if not api_key:
            st.warning("Please enter a Google API key to continue")
            st.stop()

    try:
        llm, gemini_model_name = _build_gemini_llm(api_key)
    except LookupError as e:
        # The message already lists the available models.
        st.error(str(e))
        st.stop()
    except Exception as e:
        st.error(f"Error initializing Gemini model: {str(e)}")
        st.stop()

    st.success(f"Using Gemini model: {gemini_model_name}")
    return llm
77
+
78
# Session state for chat history
if "chat_history" not in st.session_state:
    st.session_state.chat_history = []
if "analytics" not in st.session_state:
    st.session_state.analytics = []

# File uploader
pdf_files = st.file_uploader("Upload one or more PDFs", type="pdf", accept_multiple_files=True)

# Store uploaded files in session state for later use
if pdf_files:
    st.session_state.uploaded_files = pdf_files

# Interactive PDF Viewer
with st.expander("📑 PDF Viewer", expanded=False):
    try:
        if st.session_state.uploaded_files:
            # The first selectbox entry is a placeholder meaning "nothing chosen".
            placeholder = "Select PDF File"
            pdf_file_names = [placeholder] + [
                uploaded.name for uploaded in st.session_state.uploaded_files
            ]
            selected_pdf = st.selectbox("Select a PDF to view", pdf_file_names)

            # Retrieve the first uploaded file with the selected name.
            selected_file = None
            if selected_pdf != placeholder:
                selected_file = next(
                    (
                        uploaded
                        for uploaded in st.session_state.uploaded_files
                        if uploaded.name == selected_pdf
                    ),
                    None,
                )

            # Display the selected PDF
            if selected_file is not None:
                st.subheader(f"Viewing PDF: {selected_pdf}")

                # Read the whole file, rewinding before and after so other
                # readers of the same upload start from the beginning.
                selected_file.seek(0)
                pdf_bytes = selected_file.read()
                selected_file.seek(0)

                # Encode the PDF file in base64 for displaying in iframe
                pdf_base64 = base64.b64encode(pdf_bytes).decode('utf-8')

                # Display the PDF file in an iframe using an HTML embed
                pdf_display = f'<iframe src="data:application/pdf;base64,{pdf_base64}" width="100%" height="600" type="application/pdf"></iframe>'
                st.markdown(pdf_display, unsafe_allow_html=True)
    except Exception as e:
        st.error(f"Error displaying PDF: {str(e)}")

question = st.text_input("Ask a question across PDFs")
126
+
127
# Helper: Extract page text from an uploaded PDF and split it into chunks
def load_and_chunk(file):
    """Read ``file`` as a PDF and return its text chunks and per-page text.

    Args:
        file: A seekable binary file object with a ``name`` attribute.

    Returns:
        ``(chunks, page_map)`` where ``chunks`` is a list of text chunks in
        which each page's text is preceded by a ``[Page N]`` marker
        (1-based), and ``page_map`` maps the 0-based page index to that
        page's extracted text. On any error an error message is shown and
        ``([], {})`` is returned.

    The file pointer is restored to where it was on entry, whether or not
    processing succeeds.
    """
    # Save file pointer position
    file_pos = file.tell()

    # Reset file pointer to start
    file.seek(0)

    try:
        reader = PdfReader(file)
        page_map = {}
        tagged_pages = []
        for i, page in enumerate(reader.pages):
            # Guard against extractors that yield None for a page with no
            # text, which would otherwise be embedded as the string "None".
            text = page.extract_text() or ""
            page_map[i] = text
            tagged_pages.append(f"\n[Page {i + 1}]\n{text}")
        all_text = "".join(tagged_pages)

        # Split on single newlines: the joined text is not guaranteed to
        # contain the splitter's default "\n\n" separator, and without a
        # separator match the whole document stays one oversized chunk.
        splitter = CharacterTextSplitter(
            separator="\n", chunk_size=1000, chunk_overlap=200
        )
        chunks = splitter.split_text(all_text)

        return chunks, page_map
    except Exception as e:
        st.error(f"Error processing PDF {file.name}: {str(e)}")
        return [], {}
    finally:
        # Reset file pointer to original position, even if there's an error
        file.seek(file_pos)
154
+
155
# Helper: Create FAISS store
@st.cache_resource
def _get_embeddings():
    """Load the HuggingFace sentence-embedding model once and reuse it."""
    return HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")


def embed_documents(chunks):
    """Build a FAISS vector store from ``chunks``.

    Args:
        chunks: List of text chunks to embed.

    Returns:
        The FAISS store, or ``None`` (after showing an error message) if
        the embeddings or the index could not be created.
    """
    try:
        # The embedding model comes from a cached helper so it is not
        # re-instantiated for every uploaded file.
        return FAISS.from_texts(chunks, _get_embeddings())
    except Exception as e:
        st.error(f"Error creating embeddings: {str(e)}")
        return None
164
+
165
# Helper: Display PDF Page (both methods available)
def show_pdf_page(file, page_num, use_iframe=False):
    """Show one page of an uploaded PDF in the app.

    Args:
        file: A seekable binary file object holding the PDF.
        page_num: 1-based page number to show.
        use_iframe: When true, embed the whole PDF in an iframe opened at
            ``page_num``; otherwise render that page to a PNG with PyMuPDF
            and show it as an image.

    Errors are reported with ``st.error`` instead of being raised. The file
    pointer is restored to its position on entry, and any temporary files
    created for rendering are removed on every exit path.
    """
    # Save current position
    file_pos = file.tell()

    # Reset file pointer
    file.seek(0)

    tmp_path = None
    img_path = None
    doc = None
    try:
        # Read the entire PDF
        pdf_bytes = file.read()

        if use_iframe:
            # Encode the PDF file in base64 for displaying in iframe
            pdf_base64 = base64.b64encode(pdf_bytes).decode('utf-8')
            # Display the PDF file in an iframe with page number parameter
            pdf_display = f'<iframe src="data:application/pdf;base64,{pdf_base64}#page={page_num}" width="100%" height="500" type="application/pdf"></iframe>'
            st.markdown(pdf_display, unsafe_allow_html=True)
            return

        # Image method: write the PDF to a temp file and render with PyMuPDF
        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
            tmp.write(pdf_bytes)
            tmp_path = tmp.name

        # Open the saved PDF
        doc = fitz.open(tmp_path)

        # Validate page number
        if page_num < 1 or page_num > len(doc):
            st.error(f"Invalid page number: {page_num}. Document has {len(doc)} pages.")
            return

        page = doc.load_page(page_num - 1)
        pix = page.get_pixmap()
        # Derive the image path from the extension only, so a ".pdf"
        # elsewhere in the temp path cannot be rewritten.
        img_path = f"{os.path.splitext(tmp_path)[0]}_page{page_num}.png"
        pix.save(img_path)
        st.image(img_path, caption=f"Page {page_num}")
    except Exception as e:
        st.error(f"Error displaying PDF page: {str(e)}")
    finally:
        try:
            # Close the document before deleting the file it was opened from.
            if doc is not None:
                doc.close()
            for path in (img_path, tmp_path):
                if path:
                    try:
                        os.unlink(path)
                    except OSError:
                        pass  # Best-effort cleanup; a leftover temp file is harmless
        finally:
            # Reset file pointer to original position
            file.seek(file_pos)
214
+
215
# Helper: Summarize
@st.cache_data
def _cached_summary(chunks, _llm):
    """Return the LLM summary of the first five chunks; cached on ``chunks``.

    ``_llm`` carries a leading underscore so Streamlit leaves it out of the
    cache key. Exceptions propagate to the caller, so only successful
    summaries are stored in the cache.
    """
    summary_prompt = PromptTemplate(
        input_variables=["context"],
        template="Summarize this document:\n{context}",
    )
    chain = LLMChain(llm=_llm, prompt=summary_prompt)

    # Join only a subset of chunks to avoid token limits
    full_text = " ".join(chunks[:5])  # Limiting to first 5 chunks

    return chain.run({"context": full_text})


def summarize_doc(chunks, _llm):
    """Summarize a document given its text chunks.

    Args:
        chunks: List of text chunks; only the first five are summarized.
        _llm: The LangChain LLM used to produce the summary.

    Returns:
        The summary text, or a fixed error message (after showing the
        underlying error) if summarization fails. The fallback message is
        deliberately not cached, so a later attempt with the same chunks
        calls the model again.
    """
    try:
        return _cached_summary(chunks, _llm)
    except Exception as e:
        st.error(f"Error during summarization: {str(e)}")
        return "Error: Document too large to summarize or API error. Try with fewer pages."
232
+
233
+
234
# Initialize model and DBs
# Bind every name the later UI sections read before entering the try block,
# so a failure part-way through cannot leave them undefined.
llm = None
file_chunks, vector_dbs, page_maps = {}, {}, {}

try:
    llm = load_gemini_model()

    if pdf_files:
        with st.spinner("Processing PDF files..."):
            for file in pdf_files:
                chunks, page_map = load_and_chunk(file)
                if not chunks:
                    # Nothing was extracted from this file; skip it.
                    continue
                db = embed_documents(chunks)
                if db:  # Only store if db was successfully created
                    file_chunks[file.name] = chunks
                    page_maps[file.name] = page_map
                    vector_dbs[file.name] = db
except Exception as e:
    st.error(f"Error loading model or processing files: {str(e)}")
251
+
252
# Document Summarization UI
if pdf_files and file_chunks:
    with st.expander("📄 Document Summarization"):
        # Only files that were successfully chunked can be summarized.
        summarizable = [f.name for f in pdf_files if f.name in file_chunks]
        summarize_option = st.selectbox("Select a document to summarize",
                                        ["All"] + summarizable)
        if st.button("Summarize"):
            with st.spinner("Summarizing..."):
                try:
                    # "All" expands to every summarizable file, in upload order.
                    if summarize_option == "All":
                        targets = summarizable
                    else:
                        targets = [summarize_option]
                    for doc_name in targets:
                        summary = summarize_doc(file_chunks[doc_name], llm)
                        st.subheader(doc_name)
                        st.write(summary)
                except Exception as e:
                    st.error(f"Error during summarization: {str(e)}")
273
+
274
# Question Answering UI
# NOTE(review): this section has no guard against repeated execution, so a
# script rerun with the same question would answer it again and append
# duplicate history/analytics entries — confirm whether that is intended.
results = []
if question and vector_dbs:
    try:
        for fname, db in vector_dbs.items():
            qa = RetrievalQA.from_chain_type(llm=llm, retriever=db.as_retriever())

            try:
                result = qa({"query": question})
                answer = result['result']

                # Look up the single most similar chunk to attribute a page.
                context_docs = db.similarity_search(question, k=1)
                if not context_docs:
                    continue
                context = context_docs[0].page_content

                # Take the page number from the first "[Page N]" marker in
                # the chunk; fall back to "Unknown" when there is none.
                page_match = re.search(r"\[Page (\d+)\]", context)
                page_num = page_match.group(1) if page_match else "Unknown"

                st.markdown(f"### 📘 {fname} (Page {page_num})")
                st.write(answer)

                st.session_state.chat_history.append({
                    "file": fname,
                    "page": page_num,
                    "question": question,
                    "answer": answer
                })

                st.session_state.analytics.append({
                    "file": fname,
                    # 0 stands in for an unknown page so the column stays numeric.
                    "page": int(page_num) if page_num.isdigit() else 0,
                    # Fixed placeholder value; not computed from the model output.
                    "confidence": 0.9,
                    "question": question
                })

                results.append((fname, page_num, question, answer))
            except Exception as e:
                st.error(f"Error processing question for {fname}: {str(e)}")
    except Exception as e:
        st.error(f"Error during question answering: {str(e)}")
323
+
324
# Chat History Panel
if st.session_state.chat_history:
    with st.expander("💬 Chat History"):
        # Show the most recent entries first.
        for entry in reversed(st.session_state.chat_history):
            st.markdown(f"**{entry['file']}** | Page {entry['page']}\n> {entry['question']}\n→ {entry['answer']}")
329
+
330
# Downloadable Report
if results:
    with st.expander("📥 Download Q&A Report"):
        # Build a Word document with one entry per answered file.
        report = Document()
        report.add_heading("PDF QA Report", 0)
        for fname, page, q, a in results:
            report.add_paragraph(f"File: {fname} | Page: {page}", style="List Bullet")
            report.add_paragraph(f"Q: {q}")
            report.add_paragraph(f"A: {a}\n")

        try:
            # Serialize in memory instead of via a temp file, and offer the
            # bytes through a download button so the browser receives a
            # proper file name and MIME type.
            buffer = io.BytesIO()
            report.save(buffer)
            st.download_button(
                label="Download DOCX Report",
                data=buffer.getvalue(),
                file_name="pdf_qa_report.docx",
                mime="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
            )
        except Exception as e:
            st.error(f"Error creating downloadable report: {str(e)}")
353
+
354
# Analytics Dashboard
if st.session_state.analytics:
    with st.expander("📊 Analytics Dashboard"):
        df = pd.DataFrame(st.session_state.analytics)

        # Left column: raw analytics table; right column: per-file histogram.
        table_col, chart_col = st.columns(2)
        with table_col:
            st.dataframe(df)
        with chart_col:
            try:
                fig = px.histogram(df, x="file", color="page", title="Answer Distribution by File")
                st.plotly_chart(fig, use_container_width=True)
            except Exception as e:
                st.error(f"Error generating analytics chart: {str(e)}")

        # Optional filter: show only the rows for one file.
        st.markdown("Use filters below to explore:")
        file_filter = st.selectbox("Filter by file", ["All"] + list(df["file"].unique()))
        if file_filter != "All":
            st.dataframe(df[df["file"] == file_filter])