Update app.py
Browse files
app.py
CHANGED
@@ -1,4 +1,3 @@
|
|
1 |
-
|
2 |
import os
|
3 |
import time
|
4 |
import io
|
@@ -77,20 +76,24 @@ def scroll_to_bottom():
|
|
77 |
"""
|
78 |
st.components.v1.html(js, height=0)
|
79 |
|
80 |
-
#
|
|
|
|
|
|
|
81 |
@st.cache_data(show_spinner=False, ttl=3600)
|
82 |
@handle_errors
|
83 |
def summarize_pdf(_pdf_file_path, num_clusters=10):
|
|
|
84 |
embeddings_model = OpenAIEmbeddings(model="text-embedding-3-small", api_key=openai_api_key)
|
85 |
llm = ChatOpenAI(model="gpt-3.5-turbo", api_key=openai_api_key, temperature=0.3)
|
86 |
|
87 |
prompt = ChatPromptTemplate.from_template(
|
88 |
"""Generate a comprehensive summary with these elements:
|
89 |
-
|
90 |
-
|
91 |
-
|
92 |
-
|
93 |
-
|
94 |
)
|
95 |
|
96 |
loader = PyMuPDFLoader(_pdf_file_path)
|
@@ -104,11 +107,65 @@ def summarize_pdf(_pdf_file_path, num_clusters=10):
|
|
104 |
embeddings = embeddings_model.embed_documents(split_contents)
|
105 |
kmeans = KMeans(n_clusters=num_clusters, random_state=0).fit(embeddings)
|
106 |
closest_indices = [np.argmin(np.linalg.norm(embeddings - center, axis=1))
|
107 |
-
|
108 |
|
109 |
chain = prompt | llm | StrOutputParser()
|
110 |
return chain.invoke({"topic": ' '.join([split_contents[idx] for idx in closest_indices])})
|
111 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
112 |
@st.cache_data(show_spinner=False, ttl=3600)
|
113 |
@handle_errors
|
114 |
def qa_pdf(_pdf_file_path, query, num_clusters=5):
|
@@ -117,12 +174,12 @@ def qa_pdf(_pdf_file_path, query, num_clusters=5):
|
|
117 |
|
118 |
prompt = ChatPromptTemplate.from_template(
|
119 |
"""Answer this question: {question}
|
120 |
-
|
121 |
-
|
122 |
-
|
123 |
-
|
124 |
-
|
125 |
-
|
126 |
)
|
127 |
|
128 |
loader = PyMuPDFLoader(_pdf_file_path)
|
@@ -135,7 +192,7 @@ def qa_pdf(_pdf_file_path, query, num_clusters=5):
|
|
135 |
|
136 |
query_embedding = embeddings_model.embed_query(query)
|
137 |
similarities = cosine_similarity([query_embedding],
|
138 |
-
|
139 |
top_indices = np.argsort(similarities)[-num_clusters:]
|
140 |
|
141 |
chain = prompt | llm | StrOutputParser()
|
@@ -144,6 +201,7 @@ def qa_pdf(_pdf_file_path, query, num_clusters=5):
|
|
144 |
"context": ' '.join([split_contents[i] for i in top_indices])
|
145 |
})
|
146 |
|
|
|
147 |
@st.cache_data(show_spinner=False, ttl=3600)
|
148 |
@handle_errors
|
149 |
def process_pdf(_pdf_file_path):
|
@@ -169,7 +227,7 @@ def process_pdf(_pdf_file_path):
|
|
169 |
|
170 |
for (x1, y1, x2, y2, cls) in boxes:
|
171 |
cropped = high_res_img[int(y1*scale_factor):int(y2*scale_factor),
|
172 |
-
|
173 |
if cls == 4:
|
174 |
all_figures.append(cropped)
|
175 |
else:
|
@@ -184,7 +242,9 @@ def image_to_base64(img):
|
|
184 |
img.save(buffered, format="JPEG", quality=85)
|
185 |
return base64.b64encode(buffered.getvalue()).decode()
|
186 |
|
187 |
-
#
|
|
|
|
|
188 |
st.set_page_config(
|
189 |
page_title="PDF Assistant",
|
190 |
page_icon="π",
|
@@ -226,6 +286,9 @@ if uploaded_file:
|
|
226 |
with open(file_path, "wb") as f:
|
227 |
f.write(uploaded_file.getbuffer())
|
228 |
|
|
|
|
|
|
|
229 |
chat_container = st.container()
|
230 |
with chat_container:
|
231 |
for idx, chat in enumerate(st.session_state.chat_history):
|
@@ -246,7 +309,10 @@ if uploaded_file:
|
|
246 |
if st.button("π Generate Summary", use_container_width=True):
|
247 |
with st.spinner("Analyzing document structure..."):
|
248 |
show_progress("Generating summary")
|
249 |
-
|
|
|
|
|
|
|
250 |
st.session_state.chat_history.append({
|
251 |
"user": "Summary request",
|
252 |
"bot": f"## Document Summary\n{summary}"
|
@@ -314,4 +380,4 @@ st.markdown("""
|
|
314 |
padding: 2rem;
|
315 |
}
|
316 |
</style>
|
317 |
-
""", unsafe_allow_html=True)
|
|
|
|
|
1 |
import os
|
2 |
import time
|
3 |
import io
|
|
|
76 |
"""
|
77 |
st.components.v1.html(js, height=0)
|
78 |
|
79 |
+
# ----------------------------
|
80 |
+
# Core Processing Functions
|
81 |
+
# ----------------------------
|
82 |
+
|
83 |
@st.cache_data(show_spinner=False, ttl=3600)
|
84 |
@handle_errors
|
85 |
def summarize_pdf(_pdf_file_path, num_clusters=10):
|
86 |
+
# Basic summarization without citations
|
87 |
embeddings_model = OpenAIEmbeddings(model="text-embedding-3-small", api_key=openai_api_key)
|
88 |
llm = ChatOpenAI(model="gpt-3.5-turbo", api_key=openai_api_key, temperature=0.3)
|
89 |
|
90 |
prompt = ChatPromptTemplate.from_template(
|
91 |
"""Generate a comprehensive summary with these elements:
|
92 |
+
1. Key findings and conclusions
|
93 |
+
2. Main methodologies used
|
94 |
+
3. Important data points
|
95 |
+
4. Limitations mentioned
|
96 |
+
Context: {topic}"""
|
97 |
)
|
98 |
|
99 |
loader = PyMuPDFLoader(_pdf_file_path)
|
|
|
107 |
embeddings = embeddings_model.embed_documents(split_contents)
|
108 |
kmeans = KMeans(n_clusters=num_clusters, random_state=0).fit(embeddings)
|
109 |
closest_indices = [np.argmin(np.linalg.norm(embeddings - center, axis=1))
|
110 |
+
for center in kmeans.cluster_centers_]
|
111 |
|
112 |
chain = prompt | llm | StrOutputParser()
|
113 |
return chain.invoke({"topic": ' '.join([split_contents[idx] for idx in closest_indices])})
|
114 |
|
115 |
+
|
116 |
+
@st.cache_data(show_spinner=False, ttl=3600)
@handle_errors
def summarize_pdf_with_citations(_pdf_file_path, num_clusters=10):
    """Summarize a PDF and include numbered in-text citations.

    The PDF text is cleaned, split into chunks, embedded, and clustered with
    KMeans. The chunk nearest each cluster centre is used as a numbered
    context ("[1]: ...", "[2]: ...") that the LLM is asked to cite and to
    list as references after the summary.

    Args:
        _pdf_file_path: Path of the PDF to summarize.
            NOTE(review): Streamlit does not hash parameters whose name
            starts with an underscore, so the cache key presumably depends
            only on ``num_clusters`` -- confirm a stale summary cannot be
            served when a different file is uploaded.
        num_clusters: Upper bound on the number of context chunks to cite.
            It is clamped to the number of available chunks.

    Returns:
        The text produced by the LLM chain (summary plus reference list).

    Raises:
        ValueError: If no text chunks could be extracted from the PDF.
    """
    embeddings_model = OpenAIEmbeddings(model="text-embedding-3-small", api_key=openai_api_key)
    llm = ChatOpenAI(model="gpt-3.5-turbo", api_key=openai_api_key, temperature=0.3)

    prompt = ChatPromptTemplate.from_template(
        """Generate a comprehensive summary with the following elements:
1. Key findings and conclusions
2. Main methodologies used
3. Important data points
4. Limitations mentioned

In your summary, include in-text citations formatted as [1], [2], etc., that refer to the source contexts provided below.
After the summary, provide a reference list mapping each citation number to its corresponding context excerpt.

Contexts:
{contexts}"""
    )

    loader = PyMuPDFLoader(_pdf_file_path)
    docs = loader.load()
    full_text = "\n".join(doc.page_content for doc in docs)
    cleaned_full_text = clean_text(remove_references(full_text))

    text_splitter = SpacyTextSplitter(chunk_size=500)
    split_contents = text_splitter.split_text(cleaned_full_text)
    if not split_contents:
        # Fail early with a clear message instead of an opaque clustering error.
        raise ValueError("No extractable text was found in the PDF.")

    # Convert once so the distance computation below works on a real matrix.
    embeddings = np.asarray(embeddings_model.embed_documents(split_contents))

    # KMeans rejects n_clusters > n_samples, so short documents must use
    # fewer clusters than requested.
    n_clusters = max(1, min(num_clusters, len(split_contents)))
    kmeans = KMeans(n_clusters=n_clusters, random_state=0).fit(embeddings)

    citation_indices = [
        int(np.argmin(np.linalg.norm(embeddings - center, axis=1)))
        for center in kmeans.cluster_centers_
    ]
    # Two centres can share the same nearest chunk; keep each chunk once
    # (first occurrence wins) so the reference list has no duplicates.
    citation_indices = list(dict.fromkeys(citation_indices))

    # Create a context string with citations (e.g. "[1]: ...", "[2]: ...")
    combined_contexts = "\n\n".join(
        f"[{number}]: {split_contents[idx]}"
        for number, idx in enumerate(citation_indices, start=1)
    )

    chain = prompt | llm | StrOutputParser()
    return chain.invoke({"contexts": combined_contexts})
|
167 |
+
|
168 |
+
|
169 |
@st.cache_data(show_spinner=False, ttl=3600)
|
170 |
@handle_errors
|
171 |
def qa_pdf(_pdf_file_path, query, num_clusters=5):
|
|
|
174 |
|
175 |
prompt = ChatPromptTemplate.from_template(
|
176 |
"""Answer this question: {question}
|
177 |
+
Using only this context: {context}
|
178 |
+
Format your answer with:
|
179 |
+
- Clear section headings
|
180 |
+
- Bullet points for lists
|
181 |
+
- **Bold** key terms
|
182 |
+
- Citations from the text"""
|
183 |
)
|
184 |
|
185 |
loader = PyMuPDFLoader(_pdf_file_path)
|
|
|
192 |
|
193 |
query_embedding = embeddings_model.embed_query(query)
|
194 |
similarities = cosine_similarity([query_embedding],
|
195 |
+
embeddings_model.embed_documents(split_contents))[0]
|
196 |
top_indices = np.argsort(similarities)[-num_clusters:]
|
197 |
|
198 |
chain = prompt | llm | StrOutputParser()
|
|
|
201 |
"context": ' '.join([split_contents[i] for i in top_indices])
|
202 |
})
|
203 |
|
204 |
+
|
205 |
@st.cache_data(show_spinner=False, ttl=3600)
|
206 |
@handle_errors
|
207 |
def process_pdf(_pdf_file_path):
|
|
|
227 |
|
228 |
for (x1, y1, x2, y2, cls) in boxes:
|
229 |
cropped = high_res_img[int(y1*scale_factor):int(y2*scale_factor),
|
230 |
+
int(x1*scale_factor):int(x2*scale_factor)]
|
231 |
if cls == 4:
|
232 |
all_figures.append(cropped)
|
233 |
else:
|
|
|
242 |
img.save(buffered, format="JPEG", quality=85)
|
243 |
return base64.b64encode(buffered.getvalue()).decode()
|
244 |
|
245 |
+
# ----------------------------
|
246 |
+
# Streamlit UI Setup
|
247 |
+
# ----------------------------
|
248 |
st.set_page_config(
|
249 |
page_title="PDF Assistant",
|
250 |
page_icon="π",
|
|
|
286 |
with open(file_path, "wb") as f:
|
287 |
f.write(uploaded_file.getbuffer())
|
288 |
|
289 |
+
# Let the user choose whether to include in-text citations in the summary
|
290 |
+
include_citations = st.checkbox("Include in-text citations in summary", value=True)
|
291 |
+
|
292 |
chat_container = st.container()
|
293 |
with chat_container:
|
294 |
for idx, chat in enumerate(st.session_state.chat_history):
|
|
|
309 |
if st.button("π Generate Summary", use_container_width=True):
|
310 |
with st.spinner("Analyzing document structure..."):
|
311 |
show_progress("Generating summary")
|
312 |
+
if include_citations:
|
313 |
+
summary = summarize_pdf_with_citations(file_path)
|
314 |
+
else:
|
315 |
+
summary = summarize_pdf(file_path)
|
316 |
st.session_state.chat_history.append({
|
317 |
"user": "Summary request",
|
318 |
"bot": f"## Document Summary\n{summary}"
|
|
|
380 |
padding: 2rem;
|
381 |
}
|
382 |
</style>
|
383 |
+
""", unsafe_allow_html=True)
|