Update app.py
Browse files
app.py
CHANGED
@@ -17,6 +17,13 @@ import torch
|
|
17 |
# import logging
|
18 |
# logging.basicConfig(level=logging.INFO)
|
19 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
20 |
# ---------------------------------------------------------------------------------------
|
21 |
# API Configuration
|
22 |
# ---------------------------------------------------------------------------------------
|
@@ -276,19 +283,29 @@ st.title("Choose Your Own Adventure (Topic Extraction) PDF Analysis App")
|
|
276 |
uploaded_file = st.file_uploader("Upload PDF file", type=["pdf"])
|
277 |
|
278 |
if uploaded_file:
|
279 |
-
|
280 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
281 |
|
282 |
-
|
283 |
-
for idx, image in enumerate(images):
|
284 |
-
markdown_text = extract_markdown_from_image(image)
|
285 |
-
markdown_texts.append(markdown_text)
|
286 |
|
287 |
-
|
|
|
|
|
|
|
288 |
|
289 |
-
|
|
|
|
|
|
|
|
|
|
|
290 |
|
291 |
-
# Check if extraction was successful
|
292 |
if df.empty or df['Document_Text'].isnull().all():
|
293 |
st.error("No meaningful text extracted from the PDF.")
|
294 |
st.stop()
|
@@ -296,6 +313,12 @@ if uploaded_file:
|
|
296 |
st.markdown("### Extracted Markdown Preview")
|
297 |
st.write(df.head())
|
298 |
|
|
|
|
|
|
|
|
|
|
|
|
|
299 |
# ---------------------------------------------------------------------------------------
|
300 |
# User Input for Topics
|
301 |
# ---------------------------------------------------------------------------------------
|
|
|
17 |
# import logging
|
18 |
# logging.basicConfig(level=logging.INFO)
|
19 |
|
20 |
+
# Seed the per-session cache slots on the first run of the script. Each entry
# pairs a session-state key with a zero-argument factory so that a fresh
# default is only built when the key is actually missing.
for _state_key, _make_default in (
    ('pdf_processed', bool),     # bool() -> False: nothing processed yet
    ('markdown_texts', list),    # list() -> []: per-page markdown strings
    ('df', pd.DataFrame),        # empty frame until a PDF has been processed
):
    if _state_key not in st.session_state:
        st.session_state[_state_key] = _make_default()
|
26 |
+
|
27 |
# ---------------------------------------------------------------------------------------
|
28 |
# API Configuration
|
29 |
# ---------------------------------------------------------------------------------------
|
|
|
283 |
# Let the user pick a PDF; the processing and preview below only run once an
# upload is present.
uploaded_file = st.file_uploader("Upload PDF file", type=["pdf"])

if uploaded_file:
    # Identify the upload by name and size so that swapping in a different PDF
    # invalidates the cached results instead of silently showing the output of
    # the previously processed file.
    pdf_fingerprint = (uploaded_file.name, uploaded_file.size)
    cache_is_fresh = (
        st.session_state['pdf_processed']
        and st.session_state.get('pdf_fingerprint') == pdf_fingerprint
    )

    if not cache_is_fresh:
        with st.spinner("Processing PDF..."):
            images = convert_pdf_to_images(uploaded_file)

            # One markdown string per page image, in page order.
            markdown_texts = [
                extract_markdown_from_image(image) for image in images
            ]

            df = pd.DataFrame({'Document_Text': markdown_texts})

            # Save results into session state so later reruns of the script
            # (triggered by any widget interaction) can skip the extraction.
            st.session_state['markdown_texts'] = markdown_texts
            st.session_state['df'] = df
            st.session_state['pdf_fingerprint'] = pdf_fingerprint
            st.session_state['pdf_processed'] = True

        st.success("PDF processed successfully!")
    else:
        st.success("PDF already processed. Using cached results.")

    # Use cached dataframe for further processing
    df = st.session_state['df']

    # Halt the script early when the extraction produced no usable text.
    if df.empty or df['Document_Text'].isnull().all():
        st.error("No meaningful text extracted from the PDF.")
        st.stop()

    st.markdown("### Extracted Markdown Preview")
    st.write(df.head())

    if st.button("Reset / Upload New PDF"):
        # Drop every cached artefact so the next run starts from scratch.
        # NOTE(review): the uploader widget presumably still holds its file
        # after the rerun, in which case this re-processes the current upload
        # rather than clearing it -- confirm that is the intended behavior.
        st.session_state['pdf_processed'] = False
        st.session_state['markdown_texts'] = []
        st.session_state['df'] = pd.DataFrame()
        st.session_state['pdf_fingerprint'] = None

        # st.experimental_rerun() was deprecated in favour of st.rerun() and is
        # absent from recent Streamlit releases; prefer the new name and only
        # fall back to the old one on versions that predate it.
        rerun = getattr(st, "rerun", None) or st.experimental_rerun
        rerun()
|
321 |
+
|
322 |
# ---------------------------------------------------------------------------------------
|
323 |
# User Input for Topics
|
324 |
# ---------------------------------------------------------------------------------------
|