ignaciaginting committed on
Commit
b39b068
·
verified ·
1 Parent(s): e205139
Files changed (1) hide show
  1. app.py +22 -12
app.py CHANGED
@@ -1,17 +1,27 @@
1
  import gradio as gr
2
- import os
 
3
  from huggingface_hub import snapshot_download
4
- from pdf_extract_kit import extract_text # Assuming this function exists in the toolkit
 
 
 
5
 
6
- # Ensure the model is downloaded
7
- model_dir = "./PDF-Extract-Kit-1.0"
8
- if not os.path.exists(model_dir):
9
- snapshot_download(repo_id='opendatalab/PDF-Extract-Kit-1.0', local_dir=model_dir, max_workers=20)
 
10
 
11
- def process_pdf(file):
12
- # Implement your PDF processing logic here using PDF-Extract-Kit
13
- extracted_text = extract_text(file.name, model_dir=model_dir)
14
- return extracted_text
15
 
16
- iface = gr.Interface(fn=process_pdf, inputs=gr.File(type="binary"), outputs="text", title="PDF Extractor")
17
- iface.launch()
 
 
 
 
 
 
1
  import gradio as gr
2
+ from modelscope.pipelines import pipeline
3
+ from modelscope.utils.constant import Tasks
4
  from huggingface_hub import snapshot_download
5
+ import os
6
+
7
+ # Step 1: Download the model
8
+ model_dir = snapshot_download('opendatalab/PDF-Extract-Kit-1.0')
9
 
10
+ # Step 2: Initialize pipeline
11
+ pipe = pipeline(
12
+ task=Tasks.document_segmentation,
13
+ model=model_dir
14
+ )
15
 
16
+ # Step 3: Define inference function
17
+ def extract_info_from_pdf(pdf_file):
18
+ result = pipe({'file': pdf_file.name})
19
+ return str(result)
20
 
21
+ # Step 4: Gradio UI
22
+ gr.Interface(
23
+ fn=extract_info_from_pdf,
24
+ inputs=gr.File(type="binary", label="Upload PDF"),
25
+ outputs="text",
26
+ title="PDF Extractor (PDF-Extract-Kit)"
27
+ ).launch()