zliang committed on
Commit
3d8d27c
·
verified ·
1 Parent(s): 3a16e8c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +41 -66
app.py CHANGED
@@ -81,26 +81,33 @@ def scroll_to_bottom():
81
  # ----------------------------
82
  @st.cache_data(show_spinner=False, ttl=3600)
83
  @handle_errors
84
- def summarize_pdf_with_citations(_pdf_file_path, num_clusters=10):
 
 
 
85
  """
86
- Generates a summary that includes in-text citations based on selected context chunks.
87
- Each context chunk is numbered (e.g. [1], [2], etc.) and is referenced in the summary.
88
- After the summary, a reference list is provided mapping each citation number to the full original text excerpt.
89
  """
90
  embeddings_model = OpenAIEmbeddings(model="text-embedding-3-small", api_key=openai_api_key)
91
  llm = ChatOpenAI(model="gpt-3.5-turbo", api_key=openai_api_key, temperature=0.3)
92
 
93
- # Updated prompt instructs the LLM to use the full excerpt in the reference list.
94
  prompt = ChatPromptTemplate.from_template(
95
- """Generate a comprehensive summary with the following elements:
96
  1. Key findings and conclusions
97
  2. Main methodologies used
98
  3. Important data points
99
  4. Limitations mentioned
100
 
101
- For any information that is directly derived from the provided context excerpts, insert an in-text citation in the format [n] where n corresponds to the excerpt number.
 
 
 
 
 
 
102
 
103
- After the summary, please provide a reference list where each citation number is mapped to the full original text excerpt as provided below. Do not simply echo the citation number; include the complete excerpt text.
104
 
105
  Context Excerpts:
106
  {contexts}"""
@@ -123,11 +130,12 @@ Context Excerpts:
123
  idx = int(np.argmin(distances))
124
  citation_indices.append(idx)
125
 
126
- # Create a context string with citations including the full original text excerpts
127
  citation_contexts = []
128
  for i, idx in enumerate(citation_indices):
129
- # Using the full excerpt from split_contents for the reference list.
130
- citation_contexts.append(f"[{i+1}]: {split_contents[idx]}")
 
131
  combined_contexts = "\n\n".join(citation_contexts)
132
 
133
  chain = prompt | llm | StrOutputParser()
@@ -136,57 +144,6 @@ Context Excerpts:
136
 
137
 
138
 
139
- @st.cache_data(show_spinner=False, ttl=3600)
140
- @handle_errors
141
- def summarize_pdf_with_citations(_pdf_file_path, num_clusters=10):
142
- """
143
- Generates a summary that includes in-text citations based on selected context chunks.
144
- Each context chunk is numbered (e.g. [1], [2], etc.) and is referenced in the summary.
145
- After the summary, a reference list is provided.
146
- """
147
- embeddings_model = OpenAIEmbeddings(model="text-embedding-3-small", api_key=openai_api_key)
148
- llm = ChatOpenAI(model="gpt-3.5-turbo", api_key=openai_api_key, temperature=0.3)
149
-
150
- prompt = ChatPromptTemplate.from_template(
151
- """Generate a comprehensive summary with the following elements:
152
- 1. Key findings and conclusions
153
- 2. Main methodologies used
154
- 3. Important data points
155
- 4. Limitations mentioned
156
-
157
- In your summary, include in-text citations formatted as [1], [2], etc., that refer to the source contexts provided below.
158
- After the summary, provide a reference list mapping each citation number to its corresponding context excerpt.
159
-
160
- Contexts:
161
- {contexts}"""
162
- )
163
-
164
- loader = PyMuPDFLoader(_pdf_file_path)
165
- docs = loader.load()
166
- full_text = "\n".join(doc.page_content for doc in docs)
167
- cleaned_full_text = clean_text(remove_references(full_text))
168
-
169
- text_splitter = SpacyTextSplitter(chunk_size=500)
170
- split_contents = text_splitter.split_text(cleaned_full_text)
171
-
172
- embeddings = embeddings_model.embed_documents(split_contents)
173
- kmeans = KMeans(n_clusters=num_clusters, random_state=0).fit(embeddings)
174
-
175
- citation_indices = []
176
- for center in kmeans.cluster_centers_:
177
- distances = np.linalg.norm(embeddings - center, axis=1)
178
- idx = int(np.argmin(distances))
179
- citation_indices.append(idx)
180
-
181
- # Create a context string with citations (e.g. "[1]: ...", "[2]: ...")
182
- citation_contexts = []
183
- for i, idx in enumerate(citation_indices):
184
- citation_contexts.append(f"[{i+1}]: {split_contents[idx]}")
185
- combined_contexts = "\n\n".join(citation_contexts)
186
-
187
- chain = prompt | llm | StrOutputParser()
188
- result = chain.invoke({"contexts": combined_contexts})
189
- return result
190
 
191
 
192
  @st.cache_data(show_spinner=False, ttl=3600)
@@ -332,15 +289,13 @@ if uploaded_file:
332
  if st.button("📝 Generate Summary", use_container_width=True):
333
  with st.spinner("Analyzing document structure..."):
334
  show_progress("Generating summary")
335
- if include_citations:
336
- summary = summarize_pdf_with_citations(file_path)
337
- else:
338
- summary = summarize_pdf(file_path)
339
  st.session_state.chat_history.append({
340
  "user": "Summary request",
341
  "bot": f"## Document Summary\n{summary}"
342
  })
343
  st.rerun()
 
344
  with col3:
345
  if st.button("🖼️ Extract Visuals", use_container_width=True):
346
  with st.spinner("Identifying figures and tables..."):
@@ -402,5 +357,25 @@ st.markdown("""
402
  border-radius: 12px;
403
  padding: 2rem;
404
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
405
  </style>
406
  """, unsafe_allow_html=True)
 
81
  # ----------------------------
82
  @st.cache_data(show_spinner=False, ttl=3600)
83
  @handle_errors
84
+
85
+ @st.cache_data(show_spinner=False, ttl=3600)
86
+ @handle_errors
87
+ def summarize_pdf_with_tooltips(_pdf_file_path, num_clusters=10):
88
  """
89
+ Generates a summary with in-text citations that display the full excerpt as a tooltip on hover.
90
+ Each citation is embedded as an HTML span element with the tooltip text.
 
91
  """
92
  embeddings_model = OpenAIEmbeddings(model="text-embedding-3-small", api_key=openai_api_key)
93
  llm = ChatOpenAI(model="gpt-3.5-turbo", api_key=openai_api_key, temperature=0.3)
94
 
 
95
  prompt = ChatPromptTemplate.from_template(
96
+ """Generate a comprehensive summary that includes the following:
97
  1. Key findings and conclusions
98
  2. Main methodologies used
99
  3. Important data points
100
  4. Limitations mentioned
101
 
102
+ For any information directly derived from the context excerpts provided below, insert an in-text citation as an HTML tooltip.
103
+ For each citation, use the following HTML format:
104
+ <span class="tooltip" data-tooltip="{full_text}">[{n}]</span>
105
+
106
+ Where:
107
+ - {n} is the citation number.
108
+ - {full_text} is the complete excerpt text for that citation.
109
 
110
+ Do not provide a separate reference list. Instead, embed the full citation text directly in the tooltip.
111
 
112
  Context Excerpts:
113
  {contexts}"""
 
130
  idx = int(np.argmin(distances))
131
  citation_indices.append(idx)
132
 
133
+ # Build the context excerpts string.
134
  citation_contexts = []
135
  for i, idx in enumerate(citation_indices):
136
+ # Replace double quotes to avoid breaking HTML attribute quotes.
137
+ excerpt = split_contents[idx].replace('"', "'")
138
+ citation_contexts.append(f"[{i+1}]: {excerpt}")
139
  combined_contexts = "\n\n".join(citation_contexts)
140
 
141
  chain = prompt | llm | StrOutputParser()
 
144
 
145
 
146
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
147
 
148
 
149
  @st.cache_data(show_spinner=False, ttl=3600)
 
289
  if st.button("📝 Generate Summary", use_container_width=True):
290
  with st.spinner("Analyzing document structure..."):
291
  show_progress("Generating summary")
292
+ summary = summarize_pdf_with_tooltips(file_path)
 
 
 
293
  st.session_state.chat_history.append({
294
  "user": "Summary request",
295
  "bot": f"## Document Summary\n{summary}"
296
  })
297
  st.rerun()
298
+
299
  with col3:
300
  if st.button("🖼️ Extract Visuals", use_container_width=True):
301
  with st.spinner("Identifying figures and tables..."):
 
357
  border-radius: 12px;
358
  padding: 2rem;
359
  }
360
+ .tooltip {
361
+ position: relative;
362
+ cursor: pointer;
363
+ border-bottom: 1px dotted #555;
364
+ }
365
+
366
+ /* Tooltip text */
367
+ .tooltip:hover::after {
368
+ content: attr(data-tooltip);
369
+ position: absolute;
370
+ left: 0;
371
+ top: 1.5em;
372
+ background: #333;
373
+ color: #fff;
374
+ padding: 5px 10px;
375
+ border-radius: 5px;
376
+ white-space: pre-wrap;
377
+ z-index: 100;
378
+ width: 300px; /* Adjust width as needed */
379
+ }
380
  </style>
381
  """, unsafe_allow_html=True)