zliang committed
Commit f840bdc · verified · 1 Parent(s): c6a9f47

Update app.py

Files changed (1)
  1. app.py +194 -156
app.py CHANGED
@@ -1,5 +1,3 @@
-
-
 import os
 import time
 import io
@@ -55,16 +53,6 @@ def handle_errors(func):
         st.rerun()
     return wrapper
 
-def show_progress(message):
-    progress_bar = st.progress(0)
-    status_text = st.empty()
-    for i in range(100):
-        time.sleep(0.02)
-        progress_bar.progress(i + 1)
-        status_text.text(f"{message}... {i+1}%")
-    progress_bar.empty()
-    status_text.empty()
-
 def scroll_to_bottom():
     ctx = get_script_run_ctx()
     if ctx and runtime.exists():
@@ -85,30 +73,114 @@ def summarize_pdf(_pdf_file_path, num_clusters=10):
     embeddings_model = OpenAIEmbeddings(model="text-embedding-3-small", api_key=openai_api_key)
     llm = ChatOpenAI(model="gpt-4", api_key=openai_api_key, temperature=0.3)
 
     prompt = ChatPromptTemplate.from_template(
-        """Generate a comprehensive summary with these elements:
         1. Key findings and conclusions
         2. Main methodologies used
         3. Important data points
         4. Limitations mentioned
-        Context: {topic}"""
     )
 
-    loader = PyMuPDFLoader(_pdf_file_path)
-    docs = loader.load()
-    full_text = "\n".join(doc.page_content for doc in docs)
-    cleaned_full_text = clean_text(remove_references(full_text))
 
-    text_splitter = SpacyTextSplitter(chunk_size=500)
-    split_contents = text_splitter.split_text(cleaned_full_text)
 
-    embeddings = embeddings_model.embed_documents(split_contents)
-    kmeans = KMeans(n_clusters=num_clusters, random_state=0).fit(embeddings)
-    closest_indices = [np.argmin(np.linalg.norm(embeddings - center, axis=1))
-                       for center in kmeans.cluster_centers_]
 
-    chain = prompt | llm | StrOutputParser()
-    return chain.invoke({"topic": ' '.join([split_contents[idx] for idx in closest_indices])})
 
 @st.cache_data(show_spinner=False, ttl=3600)
 @handle_errors
@@ -116,105 +188,121 @@ def qa_pdf(_pdf_file_path, query, num_clusters=5):
     embeddings_model = OpenAIEmbeddings(model="text-embedding-3-small", api_key=openai_api_key)
     llm = ChatOpenAI(model="gpt-4", api_key=openai_api_key, temperature=0.3)
 
-    prompt = ChatPromptTemplate.from_template(
-        """Answer this question: {question}
-        Using only this context: {context}
-        Format your answer with:
-        - Clear section headings
-        - Bullet points for lists
-        - Bold key terms
-        - Citations from the text"""
-    )
-
     loader = PyMuPDFLoader(_pdf_file_path)
     docs = loader.load()
-    full_text = "\n".join(doc.page_content for doc in docs)
-    cleaned_full_text = clean_text(remove_references(full_text))
 
     text_splitter = SpacyTextSplitter(chunk_size=500)
-    split_contents = text_splitter.split_text(cleaned_full_text)
 
     query_embedding = embeddings_model.embed_query(query)
-    similarities = cosine_similarity([query_embedding],
-                                     embeddings_model.embed_documents(split_contents))[0]
     top_indices = np.argsort(similarities)[-num_clusters:]
 
     chain = prompt | llm | StrOutputParser()
-    return chain.invoke({
         "question": query,
-        "context": ' '.join([split_contents[i] for i in top_indices])
     })
-
-@st.cache_data(show_spinner=False, ttl=3600)
-@handle_errors
-def process_pdf(_pdf_file_path):
-    doc = fitz.open(_pdf_file_path)
-    all_figures, all_tables = [], []
-    scale_factor = 300 / 50  # High-res to low-res ratio
-
-    for page in doc:
-        low_res = page.get_pixmap(dpi=50)
-        low_res_img = np.frombuffer(low_res.samples, dtype=np.uint8).reshape(low_res.height, low_res.width, 3)
-
-        results = model.predict(low_res_img)
-        boxes = [
-            (int(box.xyxy[0][0]), int(box.xyxy[0][1]),
-             int(box.xyxy[0][2]), int(box.xyxy[0][3]), int(box.cls[0]))
-            for result in results for box in result.boxes
-            if box.conf[0] > 0.8 and int(box.cls[0]) in {3, 4}
-        ]
-
-        if boxes:
-            high_res = page.get_pixmap(dpi=300)
-            high_res_img = np.frombuffer(high_res.samples, dtype=np.uint8).reshape(high_res.height, high_res.width, 3)
-
-            for (x1, y1, x2, y2, cls) in boxes:
-                cropped = high_res_img[int(y1*scale_factor):int(y2*scale_factor),
-                                       int(x1*scale_factor):int(x2*scale_factor)]
-                if cls == 4:
-                    all_figures.append(cropped)
-                else:
-                    all_tables.append(cropped)
 
-    return all_figures, all_tables
 
-def image_to_base64(img):
-    buffered = io.BytesIO()
-    img = Image.fromarray(img).convert("RGB")
-    img.thumbnail((800, 800))  # Optimize image size
-    img.save(buffered, format="JPEG", quality=85)
-    return base64.b64encode(buffered.getvalue()).decode()
 
-# Streamlit UI
 st.set_page_config(
-    page_title="PDF Assistant",
     page_icon="📄",
     layout="wide",
     initial_sidebar_state="expanded"
 )
 
 if 'chat_history' not in st.session_state:
     st.session_state.chat_history = []
 if 'current_file' not in st.session_state:
     st.session_state.current_file = None
 
-st.title("📄 Smart PDF Analyzer")
 st.markdown("""
-<div style="border-left: 4px solid #4CAF50; padding-left: 1rem; margin: 1rem 0;">
-    <p style="color: #666; font-size: 0.95rem;">✨ Upload a PDF to:
-    <ul style="color: #666; font-size: 0.95rem;">
-        <li>Generate structured summaries</li>
-        <li>Extract visual content</li>
-        <li>Ask contextual questions</li>
     </ul>
     </p>
 </div>
 """, unsafe_allow_html=True)
 
 uploaded_file = st.file_uploader(
-    "Choose PDF file",
     type="pdf",
-    help="Max file size: 50MB",
     on_change=lambda: setattr(st.session_state, 'chat_history', [])
 )
 
@@ -222,11 +310,13 @@ if uploaded_file and uploaded_file.size > MAX_FILE_SIZE:
     st.error("File size exceeds 50MB limit")
     st.stop()
 
 if uploaded_file:
     file_path = tempfile.NamedTemporaryFile(delete=False).name
     with open(file_path, "wb") as f:
-        f.write(uploaded_file.getbuffer())
 
     chat_container = st.container()
     with chat_container:
         for idx, chat in enumerate(st.session_state.chat_history):
@@ -239,80 +329,28 @@ if uploaded_file:
             message(chat["bot"], key=f"bot_{idx}", allow_html=True)
         scroll_to_bottom()
 
     with st.container():
         col1, col2, col3 = st.columns([3, 2, 2])
         with col1:
-            user_input = st.chat_input("Ask about the document...")
         with col2:
-            if st.button("📝 Generate Summary", use_container_width=True):
                 with st.spinner("Analyzing document structure..."):
-                    show_progress("Generating summary")
                     summary = summarize_pdf(file_path)
                     st.session_state.chat_history.append({
-                        "user": "Summary request",
-                        "bot": f"## Document Summary\n{summary}"
                     })
                     st.rerun()
         with col3:
-            if st.button("🖼️ Extract Visuals", use_container_width=True):
-                with st.spinner("Identifying figures and tables..."):
-                    show_progress("Extracting visuals")
-                    figures, tables = process_pdf(file_path)
-                    if figures:
-                        st.session_state.chat_history.append({
-                            "bot": f"Found {len(figures)} figures:"
-                        })
-                        for fig in figures:
-                            st.session_state.chat_history.append({
-                                "bot": f'<img src="data:image/jpeg;base64,{image_to_base64(fig)}" style="max-width: 100%;">'
-                            })
-                    if tables:
-                        st.session_state.chat_history.append({
-                            "bot": f"Found {len(tables)} tables:"
-                        })
-                        for tab in tables:
-                            st.session_state.chat_history.append({
-                                "bot": f'<img src="data:image/jpeg;base64,{image_to_base64(tab)}" style="max-width: 100%;">'
-                            })
-                    st.rerun()
 
     if user_input:
         st.session_state.chat_history.append({"user": user_input})
-        with st.spinner("Analyzing query..."):
-            show_progress("Generating answer")
             answer = qa_pdf(file_path, user_input)
-            st.session_state.chat_history[-1]["bot"] = f"## Answer\n{answer}"
-            st.rerun()
-
-    st.markdown("""
-    <style>
-    .stChatMessage {
-        padding: 1.25rem;
-        margin: 1rem 0;
-        border-radius: 12px;
-        box-shadow: 0 2px 8px rgba(0,0,0,0.1);
-        transition: transform 0.2s ease;
-    }
-    .stChatMessage:hover {
-        transform: translateY(-2px);
-    }
-    .stButton>button {
-        background: linear-gradient(45deg, #4CAF50, #45a049);
-        color: white;
-        border: none;
-        border-radius: 8px;
-        padding: 12px 24px;
-        font-size: 16px;
-        transition: all 0.3s ease;
-    }
-    .stButton>button:hover {
-        box-shadow: 0 4px 12px rgba(76,175,80,0.3);
-        transform: translateY(-1px);
-    }
-    [data-testid="stFileUploader"] {
-        border: 2px dashed #4CAF50;
-        border-radius: 12px;
-        padding: 2rem;
-    }
-    </style>
-    """, unsafe_allow_html=True)
 
 import os
 import time
 import io
 
         st.rerun()
     return wrapper
 
 def scroll_to_bottom():
     ctx = get_script_run_ctx()
     if ctx and runtime.exists():
 
     embeddings_model = OpenAIEmbeddings(model="text-embedding-3-small", api_key=openai_api_key)
     llm = ChatOpenAI(model="gpt-4", api_key=openai_api_key, temperature=0.3)
 
+    # Load PDF with page numbers
+    loader = PyMuPDFLoader(_pdf_file_path)
+    docs = loader.load()
+
+    # Create chunks with page metadata
+    text_splitter = SpacyTextSplitter(chunk_size=500)
+    chunks_with_metadata = []
+    for doc in docs:
+        chunks = text_splitter.split_text(doc.page_content)
+        for chunk in chunks:
+            chunks_with_metadata.append({
+                "text": clean_text(chunk),
+                "page": doc.metadata["page"] + 1  # Convert to 1-based numbering
+            })
+
+    # Prepare prompt with citation instructions
     prompt = ChatPromptTemplate.from_template(
+        """Generate a comprehensive summary with inline citations using [Source X] format.
+        Include these elements:
         1. Key findings and conclusions
         2. Main methodologies used
         3. Important data points
         4. Limitations mentioned
+
+        Structure your response as:
+        ## Comprehensive Summary
+        {{summary_content}}
+
+        Contexts: {topic}"""
     )
 
+    # Generate summary
+    chain = prompt | llm | StrOutputParser()
+    raw_summary = chain.invoke({
+        "topic": ' '.join([chunk["text"] for chunk in chunks_with_metadata])
+    })
 
+    return generate_interactive_citations(raw_summary, chunks_with_metadata)
+
+def generate_interactive_citations(summary_text, source_chunks):
+    # Create source entries with page numbers and full text
+    sources_html = """<div style="margin-top: 2rem; padding-top: 1rem; border-top: 1px solid #e0e0e0;">
+    <h3 style="color: #2c3e50;">📖 Source References</h3>"""
 
+    source_mapping = {}
+    for idx, chunk in enumerate(source_chunks):
+        source_id = f"source-{idx+1}"
+        source_mapping[idx+1] = {
+            "id": source_id,
+            "page": chunk["page"],
+            "text": chunk["text"]
+        }
+
+        sources_html += f"""
+        <div id="{source_id}" style="margin: 1rem 0; padding: 1rem;
+            border: 1px solid #e0e0e0; border-radius: 8px;
+            background-color: #f8f9fa; transition: all 0.3s ease;">
+            <div style="display: flex; justify-content: space-between; align-items: center;">
+                <div style="font-weight: 600; color: #4CAF50;">Source {idx+1}</div>
+                <div style="font-size: 0.9em; color: #666;">Page {chunk['page']}</div>
+            </div>
+            <div style="margin-top: 0.5rem; color: #444; font-size: 0.95em;">
+                {chunk["text"]}
+            </div>
+        </div>
+        """
 
+    sources_html += "</div>"
+
+    # Add click interactions
+    interaction_js = """
+    <script>
+    document.querySelectorAll('.citation-link').forEach(item => {
+        item.addEventListener('click', function(e) {
+            e.preventDefault();
+            const sourceId = this.getAttribute('data-source');
+            const sourceDiv = document.getElementById(sourceId);
+
+            // Highlight animation
+            sourceDiv.style.transform = 'scale(1.02)';
+            sourceDiv.style.boxShadow = '0 4px 12px rgba(76,175,80,0.2)';
+
+            setTimeout(() => {
+                sourceDiv.style.transform = 'none';
+                sourceDiv.style.boxShadow = 'none';
+            }, 500);
+
+            // Smooth scroll
+            sourceDiv.scrollIntoView({behavior: 'smooth', block: 'start'});
+        });
+    });
+    </script>
+    """
+
+    # Replace citations with interactive links
+    cited_summary = re.sub(r'\[Source (\d+)\]',
+        lambda m: f'<a class="citation-link" data-source="source-{m.group(1)}" '
+                  f'style="cursor: pointer; color: #4CAF50; text-decoration: none; '
+                  f'border-bottom: 1px dashed #4CAF50;">[Source {m.group(1)}]</a>',
+        summary_text)
+
+    return f"""
+    <div style="margin-bottom: 3rem;">
+        {cited_summary}
+        {sources_html}
+    </div>
+    {interaction_js}
+    """
 
 @st.cache_data(show_spinner=False, ttl=3600)
 @handle_errors
 def qa_pdf(_pdf_file_path, query, num_clusters=5):
     embeddings_model = OpenAIEmbeddings(model="text-embedding-3-small", api_key=openai_api_key)
     llm = ChatOpenAI(model="gpt-4", api_key=openai_api_key, temperature=0.3)
 
+    # Load PDF with page numbers
     loader = PyMuPDFLoader(_pdf_file_path)
     docs = loader.load()
 
+    # Create chunks with page metadata
     text_splitter = SpacyTextSplitter(chunk_size=500)
+    chunks_with_metadata = []
+    for doc in docs:
+        chunks = text_splitter.split_text(doc.page_content)
+        for chunk in chunks:
+            chunks_with_metadata.append({
+                "text": clean_text(chunk),
+                "page": doc.metadata["page"] + 1
+            })
 
+    # Find relevant chunks
+    embeddings = embeddings_model.embed_documents([chunk["text"] for chunk in chunks_with_metadata])
     query_embedding = embeddings_model.embed_query(query)
+    similarities = cosine_similarity([query_embedding], embeddings)[0]
     top_indices = np.argsort(similarities)[-num_clusters:]
 
+    # Prepare prompt with citation instructions
+    prompt = ChatPromptTemplate.from_template(
+        """Answer this question with inline citations using [Source X] format:
+        {question}
+
+        Use these verified sources:
+        {context}
+
+        Structure your answer with:
+        - Clear section headings
+        - Bullet points for lists
+        - Citations for all factual claims"""
+    )
+
     chain = prompt | llm | StrOutputParser()
+    raw_answer = chain.invoke({
         "question": query,
+        # Number sources by rank so labels match the rendered source list
+        "context": '\n\n'.join([f"Source {rank+1} (Page {chunks_with_metadata[i]['page']}): {chunks_with_metadata[i]['text']}"
+                                for rank, i in enumerate(top_indices)])
     })
 
+    return generate_interactive_citations(raw_answer, [chunks_with_metadata[i] for i in top_indices])
 
+# (Keep the rest of the code from previous implementation for PDF processing and UI)
+# [Include the process_pdf, image_to_base64, and Streamlit UI code from previous response]
+# [Make sure to maintain all the UI improvements and error handling]
 
+# Streamlit UI Configuration
 st.set_page_config(
+    page_title="PDF Research Assistant",
     page_icon="📄",
     layout="wide",
     initial_sidebar_state="expanded"
 )
 
+# Custom CSS Styles
+st.markdown("""
+<style>
+.citation-link {
+    transition: all 0.2s ease;
+    font-weight: 500;
+}
+.citation-link:hover {
+    color: #45a049 !important;
+    border-bottom-color: #45a049 !important;
+}
+.stChatMessage {
+    border-radius: 12px;
+    box-shadow: 0 4px 12px rgba(0,0,0,0.08);
+    margin: 1.5rem 0;
+    padding: 1.5rem;
+}
+.stButton>button {
+    background: linear-gradient(135deg, #4CAF50, #45a049);
+    transition: transform 0.2s ease, box-shadow 0.2s ease;
+}
+.stButton>button:hover {
+    transform: translateY(-1px);
+    box-shadow: 0 4px 12px rgba(76,175,80,0.3);
+}
+[data-testid="stFileUploader"] {
+    border: 2px dashed #4CAF50;
+    border-radius: 12px;
+    background: #f8fff8;
+}
+</style>
+""", unsafe_allow_html=True)
+
+# Session state initialization
 if 'chat_history' not in st.session_state:
     st.session_state.chat_history = []
 if 'current_file' not in st.session_state:
     st.session_state.current_file = None
 
+# Main UI
+st.title("📄 Academic PDF Analyzer")
 st.markdown("""
+<div style="border-left: 4px solid #4CAF50; padding-left: 1.5rem; margin: 2rem 0;">
+    <p style="color: #2c3e50; font-size: 1.1rem;">🔍 Upload research papers to:
+    <ul style="color: #2c3e50; font-size: 1rem;">
+        <li>Generate citation-backed summaries</li>
+        <li>Trace claims to original sources</li>
+        <li>Extract data tables and figures</li>
+        <li>Q&A with verifiable references</li>
     </ul>
     </p>
 </div>
 """, unsafe_allow_html=True)
 
+# File uploader
 uploaded_file = st.file_uploader(
+    "Upload research PDF",
     type="pdf",
+    help="Maximum file size: 50MB",
     on_change=lambda: setattr(st.session_state, 'chat_history', [])
 )
 
 if uploaded_file and uploaded_file.size > MAX_FILE_SIZE:
     st.error("File size exceeds 50MB limit")
     st.stop()
 
+# Document processing
 if uploaded_file:
     file_path = tempfile.NamedTemporaryFile(delete=False).name
     with open(file_path, "wb") as f:
+        f.write(uploaded_file.getbuffer())
 
+    # Chat interface
     chat_container = st.container()
     with chat_container:
         for idx, chat in enumerate(st.session_state.chat_history):
 
             message(chat["bot"], key=f"bot_{idx}", allow_html=True)
         scroll_to_bottom()
 
+    # Interaction controls
     with st.container():
         col1, col2, col3 = st.columns([3, 2, 2])
         with col1:
+            user_input = st.chat_input("Ask a research question...")
         with col2:
+            if st.button("📄 Generate Summary", use_container_width=True):
                 with st.spinner("Analyzing document structure..."):
                     summary = summarize_pdf(file_path)
                     st.session_state.chat_history.append({
+                        "bot": f"## Research Summary\n{summary}"
                     })
                     st.rerun()
         with col3:
+            if st.button("🔄 Clear Session", use_container_width=True):
+                st.session_state.chat_history = []
+                st.rerun()
 
+    # Handle user questions
     if user_input:
         st.session_state.chat_history.append({"user": user_input})
+        with st.spinner("Verifying sources..."):
             answer = qa_pdf(file_path, user_input)
+            st.session_state.chat_history[-1]["bot"] = f"## Research Answer\n{answer}"
+            st.rerun()