ignaciaginting committed on
Commit
adf0a0e
·
verified ·
1 Parent(s): 61daa02

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +30 -51
app.py CHANGED
@@ -1,53 +1,32 @@
"""Streamlit app: show each page of an uploaded PDF and the tables found on it.

Flow: ensure the model snapshot exists locally, import ``TableRecognizer``
from the snapshot's ``inference`` directory, rasterise the uploaded PDF to
page images, then run the recognizer on every page.
"""
import streamlit as st
import os
import tempfile
from huggingface_hub import snapshot_download
from pdf2image import convert_from_path
from PIL import Image
import fitz  # PyMuPDF

# Step 1: Download model if not present
# NOTE(review): only the directory's existence is checked, so an interrupted
# download would be treated as complete -- confirm whether that matters here.
MODEL_DIR = "./pdf-extract-kit"
if not os.path.exists(MODEL_DIR):
    with st.spinner("Downloading model..."):
        snapshot_download(
            repo_id="opendatalab/pdf-extract-kit-1.0",
            local_dir=MODEL_DIR,
            max_workers=20,
        )

# Step 2: Import model logic dynamically
import sys

_INFERENCE_DIR = MODEL_DIR + "/inference"
# Streamlit re-executes this script on every interaction; guard the append so
# sys.path does not accumulate duplicate entries across reruns.
if _INFERENCE_DIR not in sys.path:
    sys.path.append(_INFERENCE_DIR)
try:
    from table_recognizer import TableRecognizer
except ImportError:
    st.error("❌ Unable to load TableRecognizer. Check model directory structure.")
    st.stop()


# Step 3: Set up recognizer
@st.cache_resource
def _load_table_model():
    """Build the table recognizer once per server process, not once per rerun."""
    return TableRecognizer(
        model_dir=os.path.join(MODEL_DIR, "models", "table_recognition"),
        device="cpu",  # Change to 'cuda' if using GPU
    )


table_model = _load_table_model()

st.title("📄 PDF Table Extractor")

uploaded_file = st.file_uploader("Upload a PDF file", type=["pdf"])
if uploaded_file is not None:
    # pdf2image needs a filesystem path, so spill the upload to a temp file.
    # getvalue() returns the whole buffer regardless of the stream position.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_pdf:
        tmp_pdf.write(uploaded_file.getvalue())
        tmp_pdf_path = tmp_pdf.name

    try:
        images = convert_from_path(tmp_pdf_path)
    finally:
        # delete=False means nothing else cleans this file up.
        os.remove(tmp_pdf_path)

    for i, img in enumerate(images):
        st.subheader(f"Page {i + 1}")
        st.image(img, caption="Original Page", use_column_width=True)

        # Step 4: Run Table Recognizer
        with st.spinner("Extracting tables..."):
            # Assumes the model takes a PIL image and returns a sequence of
            # results -- TODO confirm against TableRecognizer.
            table_results = table_model(img)

        if table_results:
            for idx, table in enumerate(table_results):
                st.markdown(f"#### Table {idx + 1}")
                # Assumes table["data"] is a 2D list or pandas DataFrame.
                st.dataframe(table["data"])
        else:
            st.info("No tables detected on this page.")
 
"""Streamlit app: run the PDF-Extract-Kit OCR task on an uploaded PDF.

The upload is written to a private temporary file, processed by ``OCRTask``
and the returned data is rendered on the page.
"""
import streamlit as st
from pdf_extract_kit.tasks.ocr import OCRTask
from pdf_extract_kit.utils.config_loader import load_config
import os
import tempfile

# Configuration path for OCR task
CONFIG_PATH = "PDF-Extract-Kit/configs/ocr.yaml"


@st.cache_resource
def _load_ocr_task(config_path):
    """Load the config and build the OCR task once per server process.

    Streamlit re-executes the script on every interaction; caching avoids
    re-reading the config and re-initialising the task on each rerun.
    """
    return OCRTask(load_config(config_path))


# Streamlit app title
st.title("PDF Table Extraction")

# File uploader to upload PDF
uploaded_file = st.file_uploader("Choose a PDF file", type="pdf")

if uploaded_file is not None:
    # Save the upload to a unique temporary file. A fixed name such as
    # "temp.pdf" would be shared by every concurrent session and would be
    # left behind in the working directory.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_pdf:
        # getvalue() returns the whole buffer regardless of stream position.
        tmp_pdf.write(uploaded_file.getvalue())
        tmp_pdf_path = tmp_pdf.name

    try:
        # Initialize (or reuse) the OCR task
        task = _load_ocr_task(CONFIG_PATH)

        # Perform OCR task on the uploaded PDF
        # NOTE(review): the shape of the returned value is not visible here;
        # it is passed to st.write unchanged.
        extracted_data = task.process(tmp_pdf_path, save_dir="outputs", visualize=True)
    finally:
        # delete=False means nothing else cleans this file up.
        os.remove(tmp_pdf_path)

    # Display the extracted values
    st.write("Extracted Data:")
    st.write(extracted_data)

    # Optional: Visualize the result (depending on how the output is generated)
    # st.image('path_to_visualization_image', caption='Extracted Table', use_column_width=True)