zliang commited on
Commit
52d159a
·
verified Β·
1 Parent(s): f8659b6

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +85 -19
app.py CHANGED
@@ -1,4 +1,3 @@
1
-
2
  import os
3
  import time
4
  import io
@@ -77,20 +76,24 @@ def scroll_to_bottom():
77
  """
78
  st.components.v1.html(js, height=0)
79
 
80
- # Core processing functions
 
 
 
81
  @st.cache_data(show_spinner=False, ttl=3600)
82
  @handle_errors
83
  def summarize_pdf(_pdf_file_path, num_clusters=10):
 
84
  embeddings_model = OpenAIEmbeddings(model="text-embedding-3-small", api_key=openai_api_key)
85
  llm = ChatOpenAI(model="gpt-3.5-turbo", api_key=openai_api_key, temperature=0.3)
86
 
87
  prompt = ChatPromptTemplate.from_template(
88
  """Generate a comprehensive summary with these elements:
89
- 1. Key findings and conclusions
90
- 2. Main methodologies used
91
- 3. Important data points
92
- 4. Limitations mentioned
93
- Context: {topic}"""
94
  )
95
 
96
  loader = PyMuPDFLoader(_pdf_file_path)
@@ -104,11 +107,65 @@ def summarize_pdf(_pdf_file_path, num_clusters=10):
104
  embeddings = embeddings_model.embed_documents(split_contents)
105
  kmeans = KMeans(n_clusters=num_clusters, random_state=0).fit(embeddings)
106
  closest_indices = [np.argmin(np.linalg.norm(embeddings - center, axis=1))
107
- for center in kmeans.cluster_centers_]
108
 
109
  chain = prompt | llm | StrOutputParser()
110
  return chain.invoke({"topic": ' '.join([split_contents[idx] for idx in closest_indices])})
111
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
112
  @st.cache_data(show_spinner=False, ttl=3600)
113
  @handle_errors
114
  def qa_pdf(_pdf_file_path, query, num_clusters=5):
@@ -117,12 +174,12 @@ def qa_pdf(_pdf_file_path, query, num_clusters=5):
117
 
118
  prompt = ChatPromptTemplate.from_template(
119
  """Answer this question: {question}
120
- Using only this context: {context}
121
- Format your answer with:
122
- - Clear section headings
123
- - Bullet points for lists
124
- - Bold key terms
125
- - Citations from the text"""
126
  )
127
 
128
  loader = PyMuPDFLoader(_pdf_file_path)
@@ -135,7 +192,7 @@ def qa_pdf(_pdf_file_path, query, num_clusters=5):
135
 
136
  query_embedding = embeddings_model.embed_query(query)
137
  similarities = cosine_similarity([query_embedding],
138
- embeddings_model.embed_documents(split_contents))[0]
139
  top_indices = np.argsort(similarities)[-num_clusters:]
140
 
141
  chain = prompt | llm | StrOutputParser()
@@ -144,6 +201,7 @@ def qa_pdf(_pdf_file_path, query, num_clusters=5):
144
  "context": ' '.join([split_contents[i] for i in top_indices])
145
  })
146
 
 
147
  @st.cache_data(show_spinner=False, ttl=3600)
148
  @handle_errors
149
  def process_pdf(_pdf_file_path):
@@ -169,7 +227,7 @@ def process_pdf(_pdf_file_path):
169
 
170
  for (x1, y1, x2, y2, cls) in boxes:
171
  cropped = high_res_img[int(y1*scale_factor):int(y2*scale_factor),
172
- int(x1*scale_factor):int(x2*scale_factor)]
173
  if cls == 4:
174
  all_figures.append(cropped)
175
  else:
@@ -184,7 +242,9 @@ def image_to_base64(img):
184
  img.save(buffered, format="JPEG", quality=85)
185
  return base64.b64encode(buffered.getvalue()).decode()
186
 
187
- # Streamlit UI
 
 
188
  st.set_page_config(
189
  page_title="PDF Assistant",
190
  page_icon="📄",
@@ -226,6 +286,9 @@ if uploaded_file:
226
  with open(file_path, "wb") as f:
227
  f.write(uploaded_file.getbuffer())
228
 
 
 
 
229
  chat_container = st.container()
230
  with chat_container:
231
  for idx, chat in enumerate(st.session_state.chat_history):
@@ -246,7 +309,10 @@ if uploaded_file:
246
  if st.button("📝 Generate Summary", use_container_width=True):
247
  with st.spinner("Analyzing document structure..."):
248
  show_progress("Generating summary")
249
- summary = summarize_pdf(file_path)
 
 
 
250
  st.session_state.chat_history.append({
251
  "user": "Summary request",
252
  "bot": f"## Document Summary\n{summary}"
@@ -314,4 +380,4 @@ st.markdown("""
314
  padding: 2rem;
315
  }
316
  </style>
317
- """, unsafe_allow_html=True)
 
 
1
  import os
2
  import time
3
  import io
 
76
  """
77
  st.components.v1.html(js, height=0)
78
 
79
+ # ----------------------------
80
+ # Core Processing Functions
81
+ # ----------------------------
82
+
83
  @st.cache_data(show_spinner=False, ttl=3600)
84
  @handle_errors
85
  def summarize_pdf(_pdf_file_path, num_clusters=10):
86
+ # Basic summarization without citations
87
  embeddings_model = OpenAIEmbeddings(model="text-embedding-3-small", api_key=openai_api_key)
88
  llm = ChatOpenAI(model="gpt-3.5-turbo", api_key=openai_api_key, temperature=0.3)
89
 
90
  prompt = ChatPromptTemplate.from_template(
91
  """Generate a comprehensive summary with these elements:
92
+ 1. Key findings and conclusions
93
+ 2. Main methodologies used
94
+ 3. Important data points
95
+ 4. Limitations mentioned
96
+ Context: {topic}"""
97
  )
98
 
99
  loader = PyMuPDFLoader(_pdf_file_path)
 
107
  embeddings = embeddings_model.embed_documents(split_contents)
108
  kmeans = KMeans(n_clusters=num_clusters, random_state=0).fit(embeddings)
109
  closest_indices = [np.argmin(np.linalg.norm(embeddings - center, axis=1))
110
+ for center in kmeans.cluster_centers_]
111
 
112
  chain = prompt | llm | StrOutputParser()
113
  return chain.invoke({"topic": ' '.join([split_contents[idx] for idx in closest_indices])})
114
 
115
+
116
+ @st.cache_data(show_spinner=False, ttl=3600)
117
+ @handle_errors
118
+ def summarize_pdf_with_citations(_pdf_file_path, num_clusters=10):
119
+ """
120
+ Generates a summary that includes in-text citations based on selected context chunks.
121
+ Each context chunk is numbered (e.g. [1], [2], etc.) and is referenced in the summary.
122
+ After the summary, a reference list is provided.
123
+ """
124
+ embeddings_model = OpenAIEmbeddings(model="text-embedding-3-small", api_key=openai_api_key)
125
+ llm = ChatOpenAI(model="gpt-3.5-turbo", api_key=openai_api_key, temperature=0.3)
126
+
127
+ prompt = ChatPromptTemplate.from_template(
128
+ """Generate a comprehensive summary with the following elements:
129
+ 1. Key findings and conclusions
130
+ 2. Main methodologies used
131
+ 3. Important data points
132
+ 4. Limitations mentioned
133
+
134
+ In your summary, include in-text citations formatted as [1], [2], etc., that refer to the source contexts provided below.
135
+ After the summary, provide a reference list mapping each citation number to its corresponding context excerpt.
136
+
137
+ Contexts:
138
+ {contexts}"""
139
+ )
140
+
141
+ loader = PyMuPDFLoader(_pdf_file_path)
142
+ docs = loader.load()
143
+ full_text = "\n".join(doc.page_content for doc in docs)
144
+ cleaned_full_text = clean_text(remove_references(full_text))
145
+
146
+ text_splitter = SpacyTextSplitter(chunk_size=500)
147
+ split_contents = text_splitter.split_text(cleaned_full_text)
148
+
149
+ embeddings = embeddings_model.embed_documents(split_contents)
150
+ kmeans = KMeans(n_clusters=num_clusters, random_state=0).fit(embeddings)
151
+
152
+ citation_indices = []
153
+ for center in kmeans.cluster_centers_:
154
+ distances = np.linalg.norm(embeddings - center, axis=1)
155
+ idx = int(np.argmin(distances))
156
+ citation_indices.append(idx)
157
+
158
+ # Create a context string with citations (e.g. "[1]: ...", "[2]: ...")
159
+ citation_contexts = []
160
+ for i, idx in enumerate(citation_indices):
161
+ citation_contexts.append(f"[{i+1}]: {split_contents[idx]}")
162
+ combined_contexts = "\n\n".join(citation_contexts)
163
+
164
+ chain = prompt | llm | StrOutputParser()
165
+ result = chain.invoke({"contexts": combined_contexts})
166
+ return result
167
+
168
+
169
  @st.cache_data(show_spinner=False, ttl=3600)
170
  @handle_errors
171
  def qa_pdf(_pdf_file_path, query, num_clusters=5):
 
174
 
175
  prompt = ChatPromptTemplate.from_template(
176
  """Answer this question: {question}
177
+ Using only this context: {context}
178
+ Format your answer with:
179
+ - Clear section headings
180
+ - Bullet points for lists
181
+ - **Bold** key terms
182
+ - Citations from the text"""
183
  )
184
 
185
  loader = PyMuPDFLoader(_pdf_file_path)
 
192
 
193
  query_embedding = embeddings_model.embed_query(query)
194
  similarities = cosine_similarity([query_embedding],
195
+ embeddings_model.embed_documents(split_contents))[0]
196
  top_indices = np.argsort(similarities)[-num_clusters:]
197
 
198
  chain = prompt | llm | StrOutputParser()
 
201
  "context": ' '.join([split_contents[i] for i in top_indices])
202
  })
203
 
204
+
205
  @st.cache_data(show_spinner=False, ttl=3600)
206
  @handle_errors
207
  def process_pdf(_pdf_file_path):
 
227
 
228
  for (x1, y1, x2, y2, cls) in boxes:
229
  cropped = high_res_img[int(y1*scale_factor):int(y2*scale_factor),
230
+ int(x1*scale_factor):int(x2*scale_factor)]
231
  if cls == 4:
232
  all_figures.append(cropped)
233
  else:
 
242
  img.save(buffered, format="JPEG", quality=85)
243
  return base64.b64encode(buffered.getvalue()).decode()
244
 
245
+ # ----------------------------
246
+ # Streamlit UI Setup
247
+ # ----------------------------
248
  st.set_page_config(
249
  page_title="PDF Assistant",
250
  page_icon="📄",
 
286
  with open(file_path, "wb") as f:
287
  f.write(uploaded_file.getbuffer())
288
 
289
+ # Let the user choose whether to include in-text citations in the summary
290
+ include_citations = st.checkbox("Include in-text citations in summary", value=True)
291
+
292
  chat_container = st.container()
293
  with chat_container:
294
  for idx, chat in enumerate(st.session_state.chat_history):
 
309
  if st.button("📝 Generate Summary", use_container_width=True):
310
  with st.spinner("Analyzing document structure..."):
311
  show_progress("Generating summary")
312
+ if include_citations:
313
+ summary = summarize_pdf_with_citations(file_path)
314
+ else:
315
+ summary = summarize_pdf(file_path)
316
  st.session_state.chat_history.append({
317
  "user": "Summary request",
318
  "bot": f"## Document Summary\n{summary}"
 
380
  padding: 2rem;
381
  }
382
  </style>
383
+ """, unsafe_allow_html=True)