github-actions[bot]
committed on
Commit
·
0cfb559
1
Parent(s):
fa4d37c
Sync with https://github.com/mozilla-ai/document-to-markdown
Browse files
app.py
CHANGED
@@ -1,50 +1,49 @@
|
|
|
|
1 |
from typing import Dict, Tuple
|
2 |
import os
|
3 |
import gradio as gr
|
4 |
-
import torch.cuda
|
5 |
from docling.datamodel.base_models import InputFormat
|
6 |
-
from docling.datamodel.pipeline_options import
|
|
|
|
|
|
|
|
|
|
|
|
|
7 |
from docling.document_converter import DocumentConverter, PdfFormatOption
|
8 |
from docling_core.types import DoclingDocument
|
9 |
from docling.utils import model_downloader
|
10 |
-
from docling.datamodel.pipeline_options import smolvlm_picture_description
|
11 |
|
12 |
# Download models upon HF space initialization
|
13 |
-
pipeline_options = PdfPipelineOptions()
|
14 |
-
if torch.cuda.is_available():
|
15 |
-
print("Enabling CUDA Accelerator")
|
16 |
-
pipeline_options.accelerator_options.device = AcceleratorDevice.CUDA
|
17 |
-
pipeline_options.accelerator_options.cuda_use_flash_attention2 = True
|
18 |
if os.getenv("IS_HF_SPACE"):
|
19 |
print("Downloading models...")
|
20 |
model_downloader.download_models()
|
21 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
22 |
|
23 |
def parse_document(
|
24 |
file_path: str,
|
|
|
25 |
do_code_enrichment: bool,
|
26 |
do_formula_enrichment: bool,
|
27 |
-
do_picture_classification: bool,
|
28 |
-
do_picture_description: bool,
|
29 |
) -> Tuple[DoclingDocument, str]:
|
30 |
yield None, f"Parsing document... ⏳"
|
31 |
|
32 |
-
|
33 |
-
|
34 |
-
|
35 |
-
|
36 |
-
pipeline_options.do_picture_classification = do_picture_classification
|
37 |
-
|
38 |
-
pipeline_options.do_picture_description = do_picture_description
|
39 |
-
pipeline_options.picture_description_options = smolvlm_picture_description
|
40 |
-
pipeline_options.picture_description_options.prompt = "Describe the image in three sentences. Be concise and accurate."
|
41 |
-
pipeline_options.images_scale = 2.0
|
42 |
-
pipeline_options.generate_picture_images = True
|
43 |
|
44 |
-
print(f"Pipeline options defined: \n\t{
|
45 |
converter = DocumentConverter(
|
46 |
format_options={
|
47 |
-
InputFormat.PDF: PdfFormatOption(pipeline_options=
|
48 |
}
|
49 |
)
|
50 |
|
@@ -53,20 +52,31 @@ def parse_document(
|
|
53 |
yield result.document, "Done ✅"
|
54 |
|
55 |
|
56 |
-
def to_html(docling_doc: DoclingDocument) -> str:
|
57 |
-
return docling_doc.export_to_html()
|
|
|
58 |
|
|
|
|
|
59 |
|
60 |
-
def to_markdown(docling_doc: DoclingDocument) -> str:
|
61 |
-
return docling_doc.export_to_markdown()
|
62 |
|
|
|
|
|
63 |
|
64 |
-
def to_json(docling_doc: DoclingDocument) -> Dict:
|
65 |
-
return docling_doc.export_to_dict()
|
66 |
|
|
|
|
|
67 |
|
68 |
-
|
69 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
70 |
|
71 |
|
72 |
def upload_file(file) -> str:
|
@@ -80,10 +90,6 @@ def setup_gradio_demo():
|
|
80 |
|
81 |
Docling is very powerful tool, with lots of cool features and integrations to other AI frameworks (e.g. LlamaIndex, LangChain, and many more).
|
82 |
|
83 |
-
Model used for picture classification: [EfficientNet-B0 Document Image Classifier](https://huggingface.co/ds4sd/DocumentFigureClassifier)
|
84 |
-
|
85 |
-
Model used for picture description: [SmolVLM-256M-Instruct](https://huggingface.co/HuggingFaceTB/SmolVLM-256M-Instruct)
|
86 |
-
|
87 |
To explore the full set of features of Docling visit: https://github.com/docling-project/docling
|
88 |
"""
|
89 |
)
|
@@ -110,22 +116,18 @@ def setup_gradio_demo():
|
|
110 |
)
|
111 |
|
112 |
with gr.Column():
|
113 |
-
gr.Markdown("### 2) Configure engine
|
|
|
|
|
|
|
|
|
|
|
114 |
code_understanding = gr.Checkbox(
|
115 |
value=False, label="Enable Code understanding"
|
116 |
)
|
117 |
formula_enrichment = gr.Checkbox(
|
118 |
value=False, label="Enable Formula understanding"
|
119 |
)
|
120 |
-
picture_classification = gr.Checkbox(
|
121 |
-
value=False, label="Enable Picture classification"
|
122 |
-
)
|
123 |
-
picture_description = gr.Checkbox(
|
124 |
-
value=False, label="Enable Picture description"
|
125 |
-
)
|
126 |
-
gr.Markdown(
|
127 |
-
"_**Warning:** Enabling any of these features can potentially increase the processing time._"
|
128 |
-
)
|
129 |
|
130 |
parse_button = gr.Button("Parse document")
|
131 |
status = gr.Markdown()
|
@@ -136,40 +138,74 @@ def setup_gradio_demo():
|
|
136 |
markdown_button = gr.Button("Convert to markdown")
|
137 |
json_button = gr.Button("Convert to JSON")
|
138 |
text_button = gr.Button("Convert to text")
|
|
|
139 |
|
140 |
doc = gr.State()
|
141 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
142 |
|
143 |
parse_button.click(
|
144 |
fn=parse_document,
|
145 |
inputs=[
|
146 |
file_output,
|
|
|
147 |
code_understanding,
|
148 |
formula_enrichment,
|
149 |
-
picture_classification,
|
150 |
-
picture_description,
|
151 |
],
|
152 |
outputs=[doc, status],
|
153 |
)
|
154 |
html_button.click(
|
155 |
fn=to_html,
|
156 |
inputs=doc,
|
157 |
-
outputs=output,
|
158 |
)
|
159 |
markdown_button.click(
|
160 |
fn=to_markdown,
|
161 |
inputs=doc,
|
162 |
-
outputs=output,
|
163 |
)
|
164 |
json_button.click(
|
165 |
fn=to_json,
|
166 |
inputs=doc,
|
167 |
-
outputs=output,
|
168 |
)
|
169 |
text_button.click(
|
170 |
fn=to_text,
|
171 |
inputs=doc,
|
172 |
-
outputs=output,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
173 |
)
|
174 |
|
175 |
demo.launch()
|
|
|
1 |
+
import json
|
2 |
from typing import Dict, Tuple
|
3 |
import os
|
4 |
import gradio as gr
|
|
|
5 |
from docling.datamodel.base_models import InputFormat
|
6 |
+
from docling.datamodel.pipeline_options import (
|
7 |
+
PdfPipelineOptions,
|
8 |
+
EasyOcrOptions,
|
9 |
+
TesseractOcrOptions,
|
10 |
+
RapidOcrOptions,
|
11 |
+
OcrMacOptions,
|
12 |
+
)
|
13 |
from docling.document_converter import DocumentConverter, PdfFormatOption
|
14 |
from docling_core.types import DoclingDocument
|
15 |
from docling.utils import model_downloader
|
|
|
16 |
|
17 |
# Download models upon HF space initialization
|
|
|
|
|
|
|
|
|
|
|
18 |
if os.getenv("IS_HF_SPACE"):
|
19 |
print("Downloading models...")
|
20 |
model_downloader.download_models()
|
21 |
|
22 |
+
engines_available = {
|
23 |
+
"EasyOCR (Default)": EasyOcrOptions(),
|
24 |
+
"Tesseract": TesseractOcrOptions(),
|
25 |
+
"RapidOCR": RapidOcrOptions(),
|
26 |
+
"OcrMac (Mac only)": OcrMacOptions(),
|
27 |
+
}
|
28 |
+
|
29 |
|
30 |
def parse_document(
|
31 |
file_path: str,
|
32 |
+
engine: str,
|
33 |
do_code_enrichment: bool,
|
34 |
do_formula_enrichment: bool,
|
|
|
|
|
35 |
) -> Tuple[DoclingDocument, str]:
|
36 |
yield None, f"Parsing document... ⏳"
|
37 |
|
38 |
+
pdf_pipeline_options = PdfPipelineOptions()
|
39 |
+
pdf_pipeline_options.ocr_options = engines_available[engine]
|
40 |
+
pdf_pipeline_options.do_code_enrichment = do_code_enrichment
|
41 |
+
pdf_pipeline_options.do_formula_enrichment = do_formula_enrichment
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
42 |
|
43 |
+
print(f"PDF Pipeline options defined: \n\t{pdf_pipeline_options}")
|
44 |
converter = DocumentConverter(
|
45 |
format_options={
|
46 |
+
InputFormat.PDF: PdfFormatOption(pipeline_options=pdf_pipeline_options)
|
47 |
}
|
48 |
)
|
49 |
|
|
|
52 |
yield result.document, "Done ✅"
|
53 |
|
54 |
|
55 |
+
def to_html(docling_doc: DoclingDocument) -> Tuple[str, str]:
|
56 |
+
return docling_doc.export_to_html(), "html"
|
57 |
+
|
58 |
|
59 |
+
def to_markdown(docling_doc: DoclingDocument) -> Tuple[str, str]:
|
60 |
+
return docling_doc.export_to_markdown(), "md"
|
61 |
|
|
|
|
|
62 |
|
63 |
+
def to_json(docling_doc: DoclingDocument) -> Tuple[Dict, str]:
|
64 |
+
return docling_doc.export_to_dict(), "json"
|
65 |
|
|
|
|
|
66 |
|
67 |
+
def to_text(docling_doc: DoclingDocument) -> Tuple[str, str]:
|
68 |
+
return docling_doc.export_to_text(), "txt"
|
69 |
|
70 |
+
|
71 |
+
def download_file(doc: str | Dict, file_extension: str):
|
72 |
+
final_filename = f"doc.{file_extension}"
|
73 |
+
if file_extension == "json":
|
74 |
+
with open(final_filename, "w") as json_file:
|
75 |
+
json.dump(doc, json_file, indent=4)
|
76 |
+
else:
|
77 |
+
with open(final_filename, "w") as file:
|
78 |
+
file.write(doc)
|
79 |
+
return [final_filename, "Downloaded ✅"]
|
80 |
|
81 |
|
82 |
def upload_file(file) -> str:
|
|
|
90 |
|
91 |
Docling is very powerful tool, with lots of cool features and integrations to other AI frameworks (e.g. LlamaIndex, LangChain, and many more).
|
92 |
|
|
|
|
|
|
|
|
|
93 |
To explore the full set of features of Docling visit: https://github.com/docling-project/docling
|
94 |
"""
|
95 |
)
|
|
|
116 |
)
|
117 |
|
118 |
with gr.Column():
|
119 |
+
gr.Markdown("### 2) Configure engine (Only applicable for PDF files)")
|
120 |
+
|
121 |
+
ocr_engine = gr.Dropdown(
|
122 |
+
choices=list(engines_available.keys()), label="Select OCR engine"
|
123 |
+
)
|
124 |
+
|
125 |
code_understanding = gr.Checkbox(
|
126 |
value=False, label="Enable Code understanding"
|
127 |
)
|
128 |
formula_enrichment = gr.Checkbox(
|
129 |
value=False, label="Enable Formula understanding"
|
130 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
131 |
|
132 |
parse_button = gr.Button("Parse document")
|
133 |
status = gr.Markdown()
|
|
|
138 |
markdown_button = gr.Button("Convert to markdown")
|
139 |
json_button = gr.Button("Convert to JSON")
|
140 |
text_button = gr.Button("Convert to text")
|
141 |
+
file_extension = gr.Text(visible=False)
|
142 |
|
143 |
doc = gr.State()
|
144 |
+
with gr.Column():
|
145 |
+
with gr.Group():
|
146 |
+
output = gr.Textbox(
|
147 |
+
label="Output",
|
148 |
+
lines=10,
|
149 |
+
interactive=False,
|
150 |
+
elem_id="output-textbox",
|
151 |
+
)
|
152 |
+
gr.HTML(
|
153 |
+
"""
|
154 |
+
<div style="display: flex; flex-direction: column; align-items: center;">
|
155 |
+
<button id="copy-button" onclick="const text = document.getElementById('output-textbox').querySelector('textarea').value; navigator.clipboard.writeText(text); const copiedMsg = document.getElementById('copied-msg'); copiedMsg.style.display = 'inline'; setTimeout(() => copiedMsg.style.display = 'none', 1500);" style="margin-top: 10px;">
|
156 |
+
📋 Copy output to clipboard
|
157 |
+
</button>
|
158 |
+
<span id="copied-msg" style="margin-left: 10px; color: green; display: none;">Copied!</span>
|
159 |
+
</div>
|
160 |
+
"""
|
161 |
+
)
|
162 |
+
|
163 |
+
download_button = gr.Button("Download to file")
|
164 |
+
# See https://github.com/gradio-app/gradio/issues/9230#issuecomment-2323771634 why this button
|
165 |
+
download_button_hidden = gr.DownloadButton(
|
166 |
+
visible=False, elem_id="download_btn_hidden"
|
167 |
+
)
|
168 |
+
download_status = gr.Markdown()
|
169 |
|
170 |
parse_button.click(
|
171 |
fn=parse_document,
|
172 |
inputs=[
|
173 |
file_output,
|
174 |
+
ocr_engine,
|
175 |
code_understanding,
|
176 |
formula_enrichment,
|
|
|
|
|
177 |
],
|
178 |
outputs=[doc, status],
|
179 |
)
|
180 |
html_button.click(
|
181 |
fn=to_html,
|
182 |
inputs=doc,
|
183 |
+
outputs=[output, file_extension],
|
184 |
)
|
185 |
markdown_button.click(
|
186 |
fn=to_markdown,
|
187 |
inputs=doc,
|
188 |
+
outputs=[output, file_extension],
|
189 |
)
|
190 |
json_button.click(
|
191 |
fn=to_json,
|
192 |
inputs=doc,
|
193 |
+
outputs=[output, file_extension],
|
194 |
)
|
195 |
text_button.click(
|
196 |
fn=to_text,
|
197 |
inputs=doc,
|
198 |
+
outputs=[output, file_extension],
|
199 |
+
)
|
200 |
+
download_button.click(
|
201 |
+
fn=download_file,
|
202 |
+
inputs=[output, file_extension],
|
203 |
+
outputs=[download_button_hidden, download_status],
|
204 |
+
).then(
|
205 |
+
fn=None,
|
206 |
+
inputs=None,
|
207 |
+
outputs=None,
|
208 |
+
js="() => document.querySelector('#download_btn_hidden').click()",
|
209 |
)
|
210 |
|
211 |
demo.launch()
|