ProfessorLeVesseur committed on
Commit
cce5718
·
verified ·
1 Parent(s): cc08b36

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +32 -9
app.py CHANGED
@@ -17,6 +17,13 @@ import torch
17
  # import logging
18
  # logging.basicConfig(level=logging.INFO)
19
 
 
 
 
 
 
 
 
20
  # ---------------------------------------------------------------------------------------
21
  # API Configuration
22
  # ---------------------------------------------------------------------------------------
@@ -276,19 +283,29 @@ st.title("Choose Your Own Adventure (Topic Extraction) PDF Analysis App")
276
  uploaded_file = st.file_uploader("Upload PDF file", type=["pdf"])
277
 
278
  if uploaded_file:
279
- with st.spinner("Processing PDF..."):
280
- images = convert_pdf_to_images(uploaded_file)
 
 
 
 
 
 
281
 
282
- markdown_texts = []
283
- for idx, image in enumerate(images):
284
- markdown_text = extract_markdown_from_image(image)
285
- markdown_texts.append(markdown_text)
286
 
287
- df = pd.DataFrame({'Document_Text': markdown_texts})
 
 
 
288
 
289
- st.success("PDF processed successfully!")
 
 
 
 
 
290
 
291
- # Check if extraction was successful
292
  if df.empty or df['Document_Text'].isnull().all():
293
  st.error("No meaningful text extracted from the PDF.")
294
  st.stop()
@@ -296,6 +313,12 @@ if uploaded_file:
296
  st.markdown("### Extracted Markdown Preview")
297
  st.write(df.head())
298
 
 
 
 
 
 
 
299
  # ---------------------------------------------------------------------------------------
300
  # User Input for Topics
301
  # ---------------------------------------------------------------------------------------
 
17
  # import logging
18
  # logging.basicConfig(level=logging.INFO)
19
 
20
+ if 'pdf_processed' not in st.session_state:
21
+ st.session_state['pdf_processed'] = False
22
+ if 'markdown_texts' not in st.session_state:
23
+ st.session_state['markdown_texts'] = []
24
+ if 'df' not in st.session_state:
25
+ st.session_state['df'] = pd.DataFrame()
26
+
27
  # ---------------------------------------------------------------------------------------
28
  # API Configuration
29
  # ---------------------------------------------------------------------------------------
 
283
  uploaded_file = st.file_uploader("Upload PDF file", type=["pdf"])
284
 
285
  if uploaded_file:
286
+ if not st.session_state['pdf_processed']:
287
+ with st.spinner("Processing PDF..."):
288
+ images = convert_pdf_to_images(uploaded_file)
289
+
290
+ markdown_texts = []
291
+ for idx, image in enumerate(images):
292
+ markdown_text = extract_markdown_from_image(image)
293
+ markdown_texts.append(markdown_text)
294
 
295
+ df = pd.DataFrame({'Document_Text': markdown_texts})
 
 
 
296
 
297
+ # Save results into session state
298
+ st.session_state['markdown_texts'] = markdown_texts
299
+ st.session_state['df'] = df
300
+ st.session_state['pdf_processed'] = True
301
 
302
+ st.success("PDF processed successfully!")
303
+ else:
304
+ st.success("PDF already processed. Using cached results.")
305
+
306
+ # Use cached dataframe for further processing
307
+ df = st.session_state['df']
308
 
 
309
  if df.empty or df['Document_Text'].isnull().all():
310
  st.error("No meaningful text extracted from the PDF.")
311
  st.stop()
 
313
  st.markdown("### Extracted Markdown Preview")
314
  st.write(df.head())
315
 
316
+ if st.button("Reset / Upload New PDF"):
317
+ st.session_state['pdf_processed'] = False
318
+ st.session_state['markdown_texts'] = []
319
+ st.session_state['df'] = pd.DataFrame()
320
+ st.experimental_rerun()
321
+
322
  # ---------------------------------------------------------------------------------------
323
  # User Input for Topics
324
  # ---------------------------------------------------------------------------------------