zliang committed on
Commit
746d859
·
verified ·
1 Parent(s): e60ba77

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +94 -208
app.py CHANGED
@@ -53,6 +53,16 @@ def handle_errors(func):
53
  st.rerun()
54
  return wrapper
55
 
 
 
 
 
 
 
 
 
 
 
56
  def scroll_to_bottom():
57
  ctx = get_script_run_ctx()
58
  if ctx and runtime.exists():
@@ -73,168 +83,31 @@ def summarize_pdf(_pdf_file_path, num_clusters=10):
73
  embeddings_model = OpenAIEmbeddings(model="text-embedding-3-small", api_key=openai_api_key)
74
  llm = ChatOpenAI(model="gpt-4", api_key=openai_api_key, temperature=0.3)
75
 
76
- # Load PDF with page numbers
77
- loader = PyMuPDFLoader(_pdf_file_path)
78
- docs = loader.load()
79
-
80
- # Create chunks with page metadata
81
- text_splitter = SpacyTextSplitter(chunk_size=500)
82
- chunks_with_metadata = []
83
- for doc in docs:
84
- chunks = text_splitter.split_text(doc.page_content)
85
- for chunk in chunks:
86
- chunks_with_metadata.append({
87
- "text": clean_text(chunk),
88
- "page": doc.metadata["page"] + 1 # Convert to 1-based numbering
89
- })
90
-
91
- # Prepare prompt with citation instructions
92
  prompt = ChatPromptTemplate.from_template(
93
- """Generate a comprehensive summary with inline citations using [Source X] format.
94
- Include these elements:
95
  1. Key findings and conclusions
96
  2. Main methodologies used
97
  3. Important data points
98
  4. Limitations mentioned
99
-
100
- Structure your response as:
101
- ## Comprehensive Summary
102
- {summary_content}
103
-
104
- Contexts: {topic}"""
105
  )
106
 
107
- # Generate summary
108
- chain = prompt | llm | StrOutputParser()
109
- raw_summary = chain.invoke({
110
- "topic": ' '.join([chunk["text"] for chunk in chunks_with_metadata])
111
- })
112
-
113
- return generate_interactive_citations(raw_summary, chunks_with_metadata)
114
-
115
- def generate_interactive_citations(summary_text, source_chunks):
116
- # Create source entries with page numbers and full text
117
- sources_html = """<div style="margin-top: 2rem; padding-top: 1rem; border-top: 1px solid #e0e0e0;">
118
- <h3 style="color: #2c3e50;">πŸ“– Source References</h3>"""
119
-
120
- source_mapping = {}
121
- for idx, chunk in enumerate(source_chunks):
122
- source_id = f"source-{idx+1}"
123
- source_mapping[idx+1] = {
124
- "id": source_id,
125
- "page": chunk["page"],
126
- "text": chunk["text"]
127
- }
128
-
129
- sources_html += f"""
130
- <div id="{source_id}" style="margin: 1rem 0; padding: 1rem;
131
- border: 1px solid #e0e0e0; border-radius: 8px;
132
- background-color: #f8f9fa; transition: all 0.3s ease;">
133
- <div style="display: flex; justify-content: space-between; align-items: center;">
134
- <div style="font-weight: 600; color: #4CAF50;">Source {idx+1}</div>
135
- <div style="font-size: 0.9em; color: #666;">Page {chunk['page']}</div>
136
- </div>
137
- <div style="margin-top: 0.5rem; color: #444; font-size: 0.95em;">
138
- {chunk["text"]}
139
- </div>
140
- </div>
141
- """
142
-
143
- sources_html += "</div>"
144
-
145
- # Add click interactions
146
- interaction_js = """
147
- <script>
148
- document.querySelectorAll('.citation-link').forEach(item => {
149
- item.addEventListener('click', function(e) {
150
- e.preventDefault();
151
- const sourceId = this.getAttribute('data-source');
152
- const sourceDiv = document.getElementById(sourceId);
153
-
154
- // Highlight animation
155
- sourceDiv.style.transform = 'scale(1.02)';
156
- sourceDiv.style.boxShadow = '0 4px 12px rgba(76,175,80,0.2)';
157
-
158
- setTimeout(() => {
159
- sourceDiv.style.transform = 'none';
160
- sourceDiv.style.boxShadow = 'none';
161
- }, 500);
162
-
163
- // Smooth scroll
164
- sourceDiv.scrollIntoView({behavior: 'smooth', block: 'start'});
165
- });
166
- });
167
- </script>
168
- """
169
-
170
- # Replace citations with interactive links
171
- cited_summary = re.sub(r'\[Source (\d+)\]',
172
- lambda m: f'<a class="citation-link" data-source="source-{m.group(1)}" '
173
- f'style="cursor: pointer; color: #4CAF50; text-decoration: none; '
174
- f'border-bottom: 1px dashed #4CAF50;">[Source {m.group(1)}]</a>',
175
- summary_text)
176
-
177
- return f"""
178
- <div style="margin-bottom: 3rem;">
179
- {cited_summary}
180
- {sources_html}
181
- </div>
182
- {interaction_js}
183
- """
184
-
185
- @st.cache_data(show_spinner=False, ttl=3600)
186
- @handle_errors
187
- def qa_pdf(_pdf_file_path, query, num_clusters=5):
188
- embeddings_model = OpenAIEmbeddings(model="text-embedding-3-small", api_key=openai_api_key)
189
- llm = ChatOpenAI(model="gpt-4", api_key=openai_api_key, temperature=0.3)
190
-
191
- # Load PDF with page numbers
192
  loader = PyMuPDFLoader(_pdf_file_path)
193
  docs = loader.load()
 
 
194
 
195
- # Create chunks with page metadata
196
  text_splitter = SpacyTextSplitter(chunk_size=500)
197
- chunks_with_metadata = []
198
- for doc in docs:
199
- chunks = text_splitter.split_text(doc.page_content)
200
- for chunk in chunks:
201
- chunks_with_metadata.append({
202
- "text": clean_text(chunk),
203
- "page": doc.metadata["page"] + 1
204
- })
205
-
206
- # Find relevant chunks
207
- embeddings = embeddings_model.embed_documents([chunk["text"] for chunk in chunks_with_metadata])
208
- query_embedding = embeddings_model.embed_query(query)
209
- similarities = cosine_similarity([query_embedding], embeddings)[0]
210
- top_indices = np.argsort(similarities)[-num_clusters:]
211
 
212
- # Prepare prompt with citation instructions
213
- prompt = ChatPromptTemplate.from_template(
214
- """Answer this question with inline citations using [Source X] format:
215
- {question}
216
-
217
- Use these verified sources:
218
- {context}
219
-
220
- Structure your answer with:
221
- - Clear section headings
222
- - Bullet points for lists
223
- - Citations for all factual claims"""
224
- )
225
 
226
  chain = prompt | llm | StrOutputParser()
227
- raw_answer = chain.invoke({
228
- "question": query,
229
- "context": '\n\n'.join([f"Source {i+1} (Page {chunks_with_metadata[i]['page']}): {chunks_with_metadata[i]['text']}"
230
- for i in top_indices])
231
- })
232
-
233
- return generate_interactive_citations(raw_answer, [chunks_with_metadata[i] for i in top_indices])
234
 
235
- # (Keep the rest of the code from previous implementation for PDF processing and UI)
236
- # [Include the process_pdf, image_to_base64, and Streamlit UI code from previous response]
237
- # [Make sure to maintain all the UI improvements and error handling]
238
  @st.cache_data(show_spinner=False, ttl=3600)
239
  @handle_errors
240
  def qa_pdf(_pdf_file_path, query, num_clusters=5):
@@ -310,73 +183,36 @@ def image_to_base64(img):
310
  img.save(buffered, format="JPEG", quality=85)
311
  return base64.b64encode(buffered.getvalue()).decode()
312
 
313
- # Streamlit UI Configuration
314
  st.set_page_config(
315
- page_title="PDF Research Assistant",
316
  page_icon="πŸ“„",
317
  layout="wide",
318
  initial_sidebar_state="expanded"
319
  )
320
 
321
- # Custom CSS Styles
322
- st.markdown("""
323
- <style>
324
- .citation-link {
325
- transition: all 0.2s ease;
326
- font-weight: 500;
327
- }
328
- .citation-link:hover {
329
- color: #45a049 !important;
330
- border-bottom-color: #45a049 !important;
331
- }
332
- .stChatMessage {
333
- border-radius: 12px;
334
- box-shadow: 0 4px 12px rgba(0,0,0,0.08);
335
- margin: 1.5rem 0;
336
- padding: 1.5rem;
337
- }
338
- .stButton>button {
339
- background: linear-gradient(135deg, #4CAF50, #45a049);
340
- transition: transform 0.2s ease, box-shadow 0.2s ease;
341
- }
342
- .stButton>button:hover {
343
- transform: translateY(-1px);
344
- box-shadow: 0 4px 12px rgba(76,175,80,0.3);
345
- }
346
- [data-testid="stFileUploader"] {
347
- border: 2px dashed #4CAF50;
348
- border-radius: 12px;
349
- background: #f8fff8;
350
- }
351
- </style>
352
- """, unsafe_allow_html=True)
353
-
354
- # Session state initialization
355
  if 'chat_history' not in st.session_state:
356
  st.session_state.chat_history = []
357
  if 'current_file' not in st.session_state:
358
  st.session_state.current_file = None
359
 
360
- # Main UI
361
- st.title("πŸ“„ Academic PDF Analyzer")
362
  st.markdown("""
363
- <div style="border-left: 4px solid #4CAF50; padding-left: 1.5rem; margin: 2rem 0;">
364
- <p style="color: #2c3e50; font-size: 1.1rem;">πŸ” Upload research papers to:
365
- <ul style="color: #2c3e50; font-size: 1rem;">
366
- <li>Generate citations-backed summaries</li>
367
- <li>Trace claims to original sources</li>
368
- <li>Extract data tables and figures</li>
369
- <li>Q&A with verifiable references</li>
370
  </ul>
371
  </p>
372
  </div>
373
  """, unsafe_allow_html=True)
374
 
375
- # File uploader
376
  uploaded_file = st.file_uploader(
377
- "Upload research PDF",
378
  type="pdf",
379
- help="Maximum file size: 50MB",
380
  on_change=lambda: setattr(st.session_state, 'chat_history', [])
381
  )
382
 
@@ -384,13 +220,11 @@ if uploaded_file and uploaded_file.size > MAX_FILE_SIZE:
384
  st.error("File size exceeds 50MB limit")
385
  st.stop()
386
 
387
- # Document processing
388
  if uploaded_file:
389
  file_path = tempfile.NamedTemporaryFile(delete=False).name
390
  with open(file_path, "wb") as f:
391
  f.write(uploaded_file.getbuffer())
392
 
393
- # Chat interface
394
  chat_container = st.container()
395
  with chat_container:
396
  for idx, chat in enumerate(st.session_state.chat_history):
@@ -403,28 +237,80 @@ if uploaded_file:
403
  message(chat["bot"], key=f"bot_{idx}", allow_html=True)
404
  scroll_to_bottom()
405
 
406
- # Interaction controls
407
  with st.container():
408
  col1, col2, col3 = st.columns([3, 2, 2])
409
  with col1:
410
- user_input = st.chat_input("Ask a research question...")
411
  with col2:
412
- if st.button("πŸ“„ Generate Summary", use_container_width=True):
413
  with st.spinner("Analyzing document structure..."):
 
414
  summary = summarize_pdf(file_path)
415
  st.session_state.chat_history.append({
416
- "bot": f"## Research Summary\n{summary}"
 
417
  })
418
  st.rerun()
419
  with col3:
420
- if st.button("πŸ”„ Clear Session", use_container_width=True):
421
- st.session_state.chat_history = []
422
- st.rerun()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
423
 
424
- # Handle user questions
425
  if user_input:
426
  st.session_state.chat_history.append({"user": user_input})
427
- with st.spinner("Verifying sources..."):
 
428
  answer = qa_pdf(file_path, user_input)
429
- st.session_state.chat_history[-1]["bot"] = f"## Research Answer\n{answer}"
430
- st.rerun()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
53
  st.rerun()
54
  return wrapper
55
 
56
def show_progress(message, steps=100, delay=0.02):
    """Render a transient, animated progress bar in the Streamlit page.

    The animation is purely cosmetic — it does not track real work. It
    advances the bar in ``steps`` increments, sleeping ``delay`` seconds
    per increment, then removes both widgets again.

    Args:
        message: Status label shown next to the percentage.
        steps: Number of increments (default 100, i.e. 1% per step).
        delay: Seconds slept per increment (default 0.02, ~2s total).
    """
    progress_bar = st.progress(0)
    status_text = st.empty()
    for i in range(steps):
        time.sleep(delay)
        # Scale the step index to a 0-100 percentage; with the default
        # steps=100 this reproduces the original "i + 1" exactly.
        pct = int((i + 1) * 100 / steps)
        progress_bar.progress(pct)
        status_text.text(f"{message}... {pct}%")
    # Clear both placeholders so the finished bar does not linger.
    progress_bar.empty()
    status_text.empty()
65
+
66
  def scroll_to_bottom():
67
  ctx = get_script_run_ctx()
68
  if ctx and runtime.exists():
 
83
  embeddings_model = OpenAIEmbeddings(model="text-embedding-3-small", api_key=openai_api_key)
84
  llm = ChatOpenAI(model="gpt-4", api_key=openai_api_key, temperature=0.3)
85
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
86
  prompt = ChatPromptTemplate.from_template(
87
+ """Generate a comprehensive summary with these elements:
 
88
  1. Key findings and conclusions
89
  2. Main methodologies used
90
  3. Important data points
91
  4. Limitations mentioned
92
+ Context: {topic}"""
 
 
 
 
 
93
  )
94
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
95
  loader = PyMuPDFLoader(_pdf_file_path)
96
  docs = loader.load()
97
+ full_text = "\n".join(doc.page_content for doc in docs)
98
+ cleaned_full_text = clean_text(remove_references(full_text))
99
 
 
100
  text_splitter = SpacyTextSplitter(chunk_size=500)
101
+ split_contents = text_splitter.split_text(cleaned_full_text)
 
 
 
 
 
 
 
 
 
 
 
 
 
102
 
103
+ embeddings = embeddings_model.embed_documents(split_contents)
104
+ kmeans = KMeans(n_clusters=num_clusters, random_state=0).fit(embeddings)
105
+ closest_indices = [np.argmin(np.linalg.norm(embeddings - center, axis=1))
106
+ for center in kmeans.cluster_centers_]
 
 
 
 
 
 
 
 
 
107
 
108
  chain = prompt | llm | StrOutputParser()
109
+ return chain.invoke({"topic": ' '.join([split_contents[idx] for idx in closest_indices])})
 
 
 
 
 
 
110
 
 
 
 
111
  @st.cache_data(show_spinner=False, ttl=3600)
112
  @handle_errors
113
  def qa_pdf(_pdf_file_path, query, num_clusters=5):
 
183
  img.save(buffered, format="JPEG", quality=85)
184
  return base64.b64encode(buffered.getvalue()).decode()
185
 
186
# Streamlit UI
# Top-level page configuration for the app.
_PAGE_CONFIG = {
    "page_title": "PDF Assistant",
    "page_icon": "πŸ“„",
    "layout": "wide",
    "initial_sidebar_state": "expanded",
}
st.set_page_config(**_PAGE_CONFIG)
193
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
194
# Seed per-session state with defaults on the first script run only;
# later reruns leave existing values untouched.
for _key, _default in (("chat_history", []), ("current_file", None)):
    if _key not in st.session_state:
        st.session_state[_key] = _default
198
 
199
+ st.title("πŸ“„ Smart PDF Analyzer")
 
200
  st.markdown("""
201
+ <div style="border-left: 4px solid #4CAF50; padding-left: 1rem; margin: 1rem 0;">
202
+ <p style="color: #666; font-size: 0.95rem;">✨ Upload a PDF to:
203
+ <ul style="color: #666; font-size: 0.95rem;">
204
+ <li>Generate structured summaries</li>
205
+ <li>Extract visual content</li>
206
+ <li>Ask contextual questions</li>
 
207
  </ul>
208
  </p>
209
  </div>
210
  """, unsafe_allow_html=True)
211
 
 
212
  uploaded_file = st.file_uploader(
213
+ "Choose PDF file",
214
  type="pdf",
215
+ help="Max file size: 50MB",
216
  on_change=lambda: setattr(st.session_state, 'chat_history', [])
217
  )
218
 
 
220
  st.error("File size exceeds 50MB limit")
221
  st.stop()
222
 
 
223
if uploaded_file:
    # Persist the upload to a real path so downstream loaders can open it.
    # Write through the NamedTemporaryFile handle inside a `with` block so
    # the descriptor is closed deterministically — the original left the
    # handle unreferenced (only `.name` was kept), leaking it until GC.
    # delete=False keeps the file on disk after the handle closes.
    with tempfile.NamedTemporaryFile(delete=False) as tmp:
        tmp.write(uploaded_file.getbuffer())
        file_path = tmp.name
227
 
 
228
  chat_container = st.container()
229
  with chat_container:
230
  for idx, chat in enumerate(st.session_state.chat_history):
 
237
  message(chat["bot"], key=f"bot_{idx}", allow_html=True)
238
  scroll_to_bottom()
239
 
 
240
  with st.container():
241
  col1, col2, col3 = st.columns([3, 2, 2])
242
  with col1:
243
+ user_input = st.chat_input("Ask about the document...")
244
  with col2:
245
+ if st.button("πŸ“ Generate Summary", use_container_width=True):
246
  with st.spinner("Analyzing document structure..."):
247
+ show_progress("Generating summary")
248
  summary = summarize_pdf(file_path)
249
  st.session_state.chat_history.append({
250
+ "user": "Summary request",
251
+ "bot": f"## Document Summary\n{summary}"
252
  })
253
  st.rerun()
254
  with col3:
255
+ if st.button("πŸ–ΌοΈ Extract Visuals", use_container_width=True):
256
+ with st.spinner("Identifying figures and tables..."):
257
+ show_progress("Extracting visuals")
258
+ figures, tables = process_pdf(file_path)
259
+ if figures:
260
+ st.session_state.chat_history.append({
261
+ "bot": f"Found {len(figures)} figures:"
262
+ })
263
+ for fig in figures:
264
+ st.session_state.chat_history.append({
265
+ "bot": f'<img src="data:image/jpeg;base64,{image_to_base64(fig)}" style="max-width: 100%;">'
266
+ })
267
+ if tables:
268
+ st.session_state.chat_history.append({
269
+ "bot": f"Found {len(tables)} tables:"
270
+ })
271
+ for tab in tables:
272
+ st.session_state.chat_history.append({
273
+ "bot": f'<img src="data:image/jpeg;base64,{image_to_base64(tab)}" style="max-width: 100%;">'
274
+ })
275
+ st.rerun()
276
 
 
277
  if user_input:
278
  st.session_state.chat_history.append({"user": user_input})
279
+ with st.spinner("Analyzing query..."):
280
+ show_progress("Generating answer")
281
  answer = qa_pdf(file_path, user_input)
282
+ st.session_state.chat_history[-1]["bot"] = f"## Answer\n{answer}"
283
+ st.rerun()
284
+
285
+ st.markdown("""
286
+ <style>
287
+ .stChatMessage {
288
+ padding: 1.25rem;
289
+ margin: 1rem 0;
290
+ border-radius: 12px;
291
+ box-shadow: 0 2px 8px rgba(0,0,0,0.1);
292
+ transition: transform 0.2s ease;
293
+ }
294
+ .stChatMessage:hover {
295
+ transform: translateY(-2px);
296
+ }
297
+ .stButton>button {
298
+ background: linear-gradient(45deg, #4CAF50, #45a049);
299
+ color: white;
300
+ border: none;
301
+ border-radius: 8px;
302
+ padding: 12px 24px;
303
+ font-size: 16px;
304
+ transition: all 0.3s ease;
305
+ }
306
+ .stButton>button:hover {
307
+ box-shadow: 0 4px 12px rgba(76,175,80,0.3);
308
+ transform: translateY(-1px);
309
+ }
310
+ [data-testid="stFileUploader"] {
311
+ border: 2px dashed #4CAF50;
312
+ border-radius: 12px;
313
+ padding: 2rem;
314
+ }
315
+ </style>
316
+ """, unsafe_allow_html=True)