Update app.py
app.py CHANGED
@@ -81,26 +81,33 @@ def scroll_to_bottom():
 # ----------------------------
 @st.cache_data(show_spinner=False, ttl=3600)
 @handle_errors
-
+
+@st.cache_data(show_spinner=False, ttl=3600)
+@handle_errors
+def summarize_pdf_with_tooltips(_pdf_file_path, num_clusters=10):
     """
-    Generates a summary
-    Each
-    After the summary, a reference list is provided mapping each citation number to the full original text excerpt.
+    Generates a summary with in-text citations that display the full excerpt as a tooltip on hover.
+    Each citation is embedded as an HTML span element with the tooltip text.
     """
     embeddings_model = OpenAIEmbeddings(model="text-embedding-3-small", api_key=openai_api_key)
     llm = ChatOpenAI(model="gpt-3.5-turbo", api_key=openai_api_key, temperature=0.3)
 
-    # Updated prompt instructs the LLM to use the full excerpt in the reference list.
     prompt = ChatPromptTemplate.from_template(
-        """Generate a comprehensive summary
+        """Generate a comprehensive summary that includes the following:
 1. Key findings and conclusions
 2. Main methodologies used
 3. Important data points
 4. Limitations mentioned
 
-For any information
+For any information directly derived from the context excerpts provided below, insert an in-text citation as an HTML tooltip.
+For each citation, use the following HTML format:
+<span class="tooltip" data-tooltip="{full_text}">[{n}]</span>
+
+Where:
+- {n} is the citation number.
+- {full_text} is the complete excerpt text for that citation.
 
-
+Do not provide a separate reference list. Instead, embed the full citation text directly in the tooltip.
 
 Context Excerpts:
 {contexts}"""
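Note on the new prompt: unless a different template_format is passed further down, `ChatPromptTemplate.from_template` uses f-string formatting, so `{full_text}` and `{n}` would be parsed as extra input variables rather than reaching the model as literal text, and `chain.invoke({"contexts": combined_contexts})` would then fail with a missing-variables error. A minimal sketch of the escaped form, with the template shortened and the import path assumed, not the committed code:

    # Sketch only: double the braces that should reach the model literally,
    # so that "contexts" stays the template's only input variable.
    from langchain_core.prompts import ChatPromptTemplate  # assumed import path

    prompt = ChatPromptTemplate.from_template(
        """For each citation, use the following HTML format:
    <span class="tooltip" data-tooltip="{{full_text}}">[{{n}}]</span>

    Context Excerpts:
    {contexts}"""
    )
    print(prompt.input_variables)  # ['contexts']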
@@ -123,11 +130,12 @@ Context Excerpts:
         idx = int(np.argmin(distances))
         citation_indices.append(idx)
 
-    #
+    # Build the context excerpts string.
     citation_contexts = []
     for i, idx in enumerate(citation_indices):
-        #
-
+        # Replace double quotes to avoid breaking HTML attribute quotes.
+        excerpt = split_contents[idx].replace('"', "'")
+        citation_contexts.append(f"[{i+1}]: {excerpt}")
     combined_contexts = "\n\n".join(citation_contexts)
 
     chain = prompt | llm | StrOutputParser()
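The `replace('"', "'")` guard above only protects the `data-tooltip="..."` attribute against double quotes. Excerpts can also contain `&`, `<` or `>`, which would still break or alter the surrounding HTML. A sketch of a stricter escape built on the standard library (hypothetical helper name, not part of the commit):

    import html

    def excerpt_for_tooltip(text: str) -> str:
        # Escape &, <, > and both quote characters so the excerpt is safe
        # inside an HTML attribute value.
        return html.escape(text, quote=True)

    example = 'A "quoted" claim & a <tag>'
    print(f'[1]: {excerpt_for_tooltip(example)}')
    # [1]: A &quot;quoted&quot; claim &amp; a &lt;tag&gt;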
@@ -136,57 +144,6 @@ Context Excerpts:
 
 
 
-@st.cache_data(show_spinner=False, ttl=3600)
-@handle_errors
-def summarize_pdf_with_citations(_pdf_file_path, num_clusters=10):
-    """
-    Generates a summary that includes in-text citations based on selected context chunks.
-    Each context chunk is numbered (e.g. [1], [2], etc.) and is referenced in the summary.
-    After the summary, a reference list is provided.
-    """
-    embeddings_model = OpenAIEmbeddings(model="text-embedding-3-small", api_key=openai_api_key)
-    llm = ChatOpenAI(model="gpt-3.5-turbo", api_key=openai_api_key, temperature=0.3)
-
-    prompt = ChatPromptTemplate.from_template(
-        """Generate a comprehensive summary with the following elements:
-1. Key findings and conclusions
-2. Main methodologies used
-3. Important data points
-4. Limitations mentioned
-
-In your summary, include in-text citations formatted as [1], [2], etc., that refer to the source contexts provided below.
-After the summary, provide a reference list mapping each citation number to its corresponding context excerpt.
-
-Contexts:
-{contexts}"""
-    )
-
-    loader = PyMuPDFLoader(_pdf_file_path)
-    docs = loader.load()
-    full_text = "\n".join(doc.page_content for doc in docs)
-    cleaned_full_text = clean_text(remove_references(full_text))
-
-    text_splitter = SpacyTextSplitter(chunk_size=500)
-    split_contents = text_splitter.split_text(cleaned_full_text)
-
-    embeddings = embeddings_model.embed_documents(split_contents)
-    kmeans = KMeans(n_clusters=num_clusters, random_state=0).fit(embeddings)
-
-    citation_indices = []
-    for center in kmeans.cluster_centers_:
-        distances = np.linalg.norm(embeddings - center, axis=1)
-        idx = int(np.argmin(distances))
-        citation_indices.append(idx)
-
-    # Create a context string with citations (e.g. "[1]: ...", "[2]: ...")
-    citation_contexts = []
-    for i, idx in enumerate(citation_indices):
-        citation_contexts.append(f"[{i+1}]: {split_contents[idx]}")
-    combined_contexts = "\n\n".join(citation_contexts)
-
-    chain = prompt | llm | StrOutputParser()
-    result = chain.invoke({"contexts": combined_contexts})
-    return result
 
 
 @st.cache_data(show_spinner=False, ttl=3600)
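For reference, the function deleted here duplicated the excerpt-selection step that the surviving summarizer still uses: embed each chunk, cluster the embeddings with KMeans, and keep the chunk nearest each cluster centre as a representative excerpt. A self-contained sketch of just that step (stand-alone names; random vectors stand in for real embeddings):

    import numpy as np
    from sklearn.cluster import KMeans

    def representative_chunk_indices(embeddings: np.ndarray, num_clusters: int = 10) -> list[int]:
        # Cluster the chunk embeddings and return, for each cluster centre,
        # the index of the chunk whose embedding lies closest to it.
        kmeans = KMeans(n_clusters=num_clusters, random_state=0).fit(embeddings)
        indices = []
        for center in kmeans.cluster_centers_:
            distances = np.linalg.norm(embeddings - center, axis=1)
            indices.append(int(np.argmin(distances)))
        return indices

    if __name__ == "__main__":
        fake_embeddings = np.random.default_rng(0).normal(size=(40, 8))
        print(representative_chunk_indices(fake_embeddings, num_clusters=5))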
@@ -332,15 +289,13 @@ if uploaded_file:
         if st.button("📝 Generate Summary", use_container_width=True):
             with st.spinner("Analyzing document structure..."):
                 show_progress("Generating summary")
-
-                    summary = summarize_pdf_with_citations(file_path)
-                else:
-                    summary = summarize_pdf(file_path)
+                summary = summarize_pdf_with_tooltips(file_path)
                 st.session_state.chat_history.append({
                     "user": "Summary request",
                     "bot": f"## Document Summary\n{summary}"
                 })
                 st.rerun()
+
     with col3:
         if st.button("🖼️ Extract Visuals", use_container_width=True):
             with st.spinner("Identifying figures and tables..."):
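Because the tooltip citations are raw HTML, the summary appended to `st.session_state.chat_history` only shows up as hoverable citations if the chat history is rendered with HTML enabled. The rendering loop is outside this diff, so the following is only an assumed sketch of what it needs to do:

    import streamlit as st

    # Assumed display structure; the app may render chat history differently.
    for message in st.session_state.get("chat_history", []):
        with st.chat_message("user"):
            st.markdown(message["user"])
        with st.chat_message("assistant"):
            # unsafe_allow_html=True is required, otherwise the tooltip spans
            # are shown as escaped text instead of hoverable citations.
            st.markdown(message["bot"], unsafe_allow_html=True)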
@@ -402,5 +357,25 @@ st.markdown("""
     border-radius: 12px;
     padding: 2rem;
 }
+.tooltip {
+    position: relative;
+    cursor: pointer;
+    border-bottom: 1px dotted #555;
+}
+
+/* Tooltip text */
+.tooltip:hover::after {
+    content: attr(data-tooltip);
+    position: absolute;
+    left: 0;
+    top: 1.5em;
+    background: #333;
+    color: #fff;
+    padding: 5px 10px;
+    border-radius: 5px;
+    white-space: pre-wrap;
+    z-index: 100;
+    width: 300px; /* Adjust width as needed */
+}
 </style>
 """, unsafe_allow_html=True)
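A quick way to exercise the new CSS without running the whole summarization pipeline is to render a hand-written citation span on the same page (test snippet, not part of the commit):

    import streamlit as st

    # Assumes the <style> block from app.py has already been injected on the page.
    st.markdown(
        '<span class="tooltip" data-tooltip="Full excerpt text shown on hover.">[1]</span>',
        unsafe_allow_html=True,
    )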