github-actions[bot] committed on
Commit
0cfb559
·
1 Parent(s): fa4d37c

Sync with https://github.com/mozilla-ai/document-to-markdown

Browse files
Files changed (1) hide show
  1. app.py +88 -52
app.py CHANGED
@@ -1,50 +1,49 @@
 
1
  from typing import Dict, Tuple
2
  import os
3
  import gradio as gr
4
- import torch.cuda
5
  from docling.datamodel.base_models import InputFormat
6
- from docling.datamodel.pipeline_options import PdfPipelineOptions, AcceleratorDevice
 
 
 
 
 
 
7
  from docling.document_converter import DocumentConverter, PdfFormatOption
8
  from docling_core.types import DoclingDocument
9
  from docling.utils import model_downloader
10
- from docling.datamodel.pipeline_options import smolvlm_picture_description
11
 
12
  # Download models upon HF space initialization
13
- pipeline_options = PdfPipelineOptions()
14
- if torch.cuda.is_available():
15
- print("Enabling CUDA Accelerator")
16
- pipeline_options.accelerator_options.device = AcceleratorDevice.CUDA
17
- pipeline_options.accelerator_options.cuda_use_flash_attention2 = True
18
  if os.getenv("IS_HF_SPACE"):
19
  print("Downloading models...")
20
  model_downloader.download_models()
21
 
 
 
 
 
 
 
 
22
 
23
  def parse_document(
24
  file_path: str,
 
25
  do_code_enrichment: bool,
26
  do_formula_enrichment: bool,
27
- do_picture_classification: bool,
28
- do_picture_description: bool,
29
  ) -> Tuple[DoclingDocument, str]:
30
  yield None, f"Parsing document... ⏳"
31
 
32
- pipeline_options.do_code_enrichment = do_code_enrichment
33
- pipeline_options.do_formula_enrichment = do_formula_enrichment
34
- pipeline_options.generate_picture_images = do_picture_classification
35
- pipeline_options.images_scale = 2
36
- pipeline_options.do_picture_classification = do_picture_classification
37
-
38
- pipeline_options.do_picture_description = do_picture_description
39
- pipeline_options.picture_description_options = smolvlm_picture_description
40
- pipeline_options.picture_description_options.prompt = "Describe the image in three sentences. Be concise and accurate."
41
- pipeline_options.images_scale = 2.0
42
- pipeline_options.generate_picture_images = True
43
 
44
- print(f"Pipeline options defined: \n\t{pipeline_options}")
45
  converter = DocumentConverter(
46
  format_options={
47
- InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
48
  }
49
  )
50
 
@@ -53,20 +52,31 @@ def parse_document(
53
  yield result.document, "Done βœ…"
54
 
55
 
56
- def to_html(docling_doc: DoclingDocument) -> str:
57
- return docling_doc.export_to_html()
 
58
 
 
 
59
 
60
- def to_markdown(docling_doc: DoclingDocument) -> str:
61
- return docling_doc.export_to_markdown()
62
 
 
 
63
 
64
- def to_json(docling_doc: DoclingDocument) -> Dict:
65
- return docling_doc.export_to_dict()
66
 
 
 
67
 
68
- def to_text(docling_doc: DoclingDocument) -> str:
69
- return docling_doc.export_to_text()
 
 
 
 
 
 
 
 
70
 
71
 
72
  def upload_file(file) -> str:
@@ -80,10 +90,6 @@ def setup_gradio_demo():
80
 
81
  Docling is very powerful tool, with lots of cool features and integrations to other AI frameworks (e.g. LlamaIndex, LangChain, and many more).
82
 
83
- Model used for picture classification: [EfficientNet-B0 Document Image Classifier](https://huggingface.co/ds4sd/DocumentFigureClassifier)
84
-
85
- Model used for picture description: [SmolVLM-256M-Instruct](https://huggingface.co/HuggingFaceTB/SmolVLM-256M-Instruct)
86
-
87
  To explore the full set of features of Docling visit: https://github.com/docling-project/docling
88
  """
89
  )
@@ -110,22 +116,18 @@ def setup_gradio_demo():
110
  )
111
 
112
  with gr.Column():
113
- gr.Markdown("### 2) Configure engine & Parse")
 
 
 
 
 
114
  code_understanding = gr.Checkbox(
115
  value=False, label="Enable Code understanding"
116
  )
117
  formula_enrichment = gr.Checkbox(
118
  value=False, label="Enable Formula understanding"
119
  )
120
- picture_classification = gr.Checkbox(
121
- value=False, label="Enable Picture classification"
122
- )
123
- picture_description = gr.Checkbox(
124
- value=False, label="Enable Picture description"
125
- )
126
- gr.Markdown(
127
- "_**Warning:** Enabling any of these features can potentially increase the processing time._"
128
- )
129
 
130
  parse_button = gr.Button("Parse document")
131
  status = gr.Markdown()
@@ -136,40 +138,74 @@ def setup_gradio_demo():
136
  markdown_button = gr.Button("Convert to markdown")
137
  json_button = gr.Button("Convert to JSON")
138
  text_button = gr.Button("Convert to text")
 
139
 
140
  doc = gr.State()
141
- output = gr.Text(label="Output")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
142
 
143
  parse_button.click(
144
  fn=parse_document,
145
  inputs=[
146
  file_output,
 
147
  code_understanding,
148
  formula_enrichment,
149
- picture_classification,
150
- picture_description,
151
  ],
152
  outputs=[doc, status],
153
  )
154
  html_button.click(
155
  fn=to_html,
156
  inputs=doc,
157
- outputs=output,
158
  )
159
  markdown_button.click(
160
  fn=to_markdown,
161
  inputs=doc,
162
- outputs=output,
163
  )
164
  json_button.click(
165
  fn=to_json,
166
  inputs=doc,
167
- outputs=output,
168
  )
169
  text_button.click(
170
  fn=to_text,
171
  inputs=doc,
172
- outputs=output,
 
 
 
 
 
 
 
 
 
 
173
  )
174
 
175
  demo.launch()
 
1
+ import json
2
  from typing import Dict, Tuple
3
  import os
4
  import gradio as gr
 
5
  from docling.datamodel.base_models import InputFormat
6
+ from docling.datamodel.pipeline_options import (
7
+ PdfPipelineOptions,
8
+ EasyOcrOptions,
9
+ TesseractOcrOptions,
10
+ RapidOcrOptions,
11
+ OcrMacOptions,
12
+ )
13
  from docling.document_converter import DocumentConverter, PdfFormatOption
14
  from docling_core.types import DoclingDocument
15
  from docling.utils import model_downloader
 
16
 
17
  # Download models upon HF space initialization
 
 
 
 
 
18
  if os.getenv("IS_HF_SPACE"):
19
  print("Downloading models...")
20
  model_downloader.download_models()
21
 
22
+ engines_available = {
23
+ "EasyOCR (Default)": EasyOcrOptions(),
24
+ "Tesseract": TesseractOcrOptions(),
25
+ "RapidOCR": RapidOcrOptions(),
26
+ "OcrMac (Mac only)": OcrMacOptions(),
27
+ }
28
+
29
 
30
  def parse_document(
31
  file_path: str,
32
+ engine: str,
33
  do_code_enrichment: bool,
34
  do_formula_enrichment: bool,
 
 
35
  ) -> Tuple[DoclingDocument, str]:
36
  yield None, f"Parsing document... ⏳"
37
 
38
+ pdf_pipeline_options = PdfPipelineOptions()
39
+ pdf_pipeline_options.ocr_options = engines_available[engine]
40
+ pdf_pipeline_options.do_code_enrichment = do_code_enrichment
41
+ pdf_pipeline_options.do_formula_enrichment = do_formula_enrichment
 
 
 
 
 
 
 
42
 
43
+ print(f"PDF Pipeline options defined: \n\t{pdf_pipeline_options}")
44
  converter = DocumentConverter(
45
  format_options={
46
+ InputFormat.PDF: PdfFormatOption(pipeline_options=pdf_pipeline_options)
47
  }
48
  )
49
 
 
52
  yield result.document, "Done βœ…"
53
 
54
 
55
def to_html(docling_doc: DoclingDocument) -> Tuple[str, str]:
    """Export the parsed document as HTML.

    Returns:
        A pair of the result of ``docling_doc.export_to_html()`` and the
        file-extension tag ``"html"``.
    """
    html_content = docling_doc.export_to_html()
    return html_content, "html"
57
+
58
 
59
def to_markdown(docling_doc: DoclingDocument) -> Tuple[str, str]:
    """Export the parsed document as Markdown.

    Returns:
        A pair of the result of ``docling_doc.export_to_markdown()`` and the
        file-extension tag ``"md"``.
    """
    markdown_content = docling_doc.export_to_markdown()
    return markdown_content, "md"
61
 
 
 
62
 
63
def to_json(docling_doc: DoclingDocument) -> Tuple[Dict, str]:
    """Export the parsed document as a dictionary for JSON output.

    Returns:
        A pair of the result of ``docling_doc.export_to_dict()`` and the
        file-extension tag ``"json"``.
    """
    document_dict = docling_doc.export_to_dict()
    return document_dict, "json"
65
 
 
 
66
 
67
def to_text(docling_doc: DoclingDocument) -> Tuple[str, str]:
    """Export the parsed document as plain text.

    Returns:
        A pair of the result of ``docling_doc.export_to_text()`` and the
        file-extension tag ``"txt"``.
    """
    text_content = docling_doc.export_to_text()
    return text_content, "txt"
69
 
70
+
71
+ def download_file(doc: str | Dict, file_extension: str):
72
+ final_filename = f"doc.{file_extension}"
73
+ if file_extension == "json":
74
+ with open(final_filename, "w") as json_file:
75
+ json.dump(doc, json_file, indent=4)
76
+ else:
77
+ with open(final_filename, "w") as file:
78
+ file.write(doc)
79
+ return [final_filename, "Downloaded βœ…"]
80
 
81
 
82
  def upload_file(file) -> str:
 
90
 
91
  Docling is very powerful tool, with lots of cool features and integrations to other AI frameworks (e.g. LlamaIndex, LangChain, and many more).
92
 
 
 
 
 
93
  To explore the full set of features of Docling visit: https://github.com/docling-project/docling
94
  """
95
  )
 
116
  )
117
 
118
  with gr.Column():
119
+ gr.Markdown("### 2) Configure engine (Only applicable for PDF files)")
120
+
121
+ ocr_engine = gr.Dropdown(
122
+ choices=list(engines_available.keys()), label="Select OCR engine"
123
+ )
124
+
125
  code_understanding = gr.Checkbox(
126
  value=False, label="Enable Code understanding"
127
  )
128
  formula_enrichment = gr.Checkbox(
129
  value=False, label="Enable Formula understanding"
130
  )
 
 
 
 
 
 
 
 
 
131
 
132
  parse_button = gr.Button("Parse document")
133
  status = gr.Markdown()
 
138
  markdown_button = gr.Button("Convert to markdown")
139
  json_button = gr.Button("Convert to JSON")
140
  text_button = gr.Button("Convert to text")
141
+ file_extension = gr.Text(visible=False)
142
 
143
  doc = gr.State()
144
+ with gr.Column():
145
+ with gr.Group():
146
+ output = gr.Textbox(
147
+ label="Output",
148
+ lines=10,
149
+ interactive=False,
150
+ elem_id="output-textbox",
151
+ )
152
+ gr.HTML(
153
+ """
154
+ <div style="display: flex; flex-direction: column; align-items: center;">
155
+ <button id="copy-button" onclick="const text = document.getElementById('output-textbox').querySelector('textarea').value; navigator.clipboard.writeText(text); const copiedMsg = document.getElementById('copied-msg'); copiedMsg.style.display = 'inline'; setTimeout(() => copiedMsg.style.display = 'none', 1500);" style="margin-top: 10px;">
156
+ πŸ“‹ Copy output to clipboard
157
+ </button>
158
+ <span id="copied-msg" style="margin-left: 10px; color: green; display: none;">Copied!</span>
159
+ </div>
160
+ """
161
+ )
162
+
163
+ download_button = gr.Button("Download to file")
164
+ # See https://github.com/gradio-app/gradio/issues/9230#issuecomment-2323771634 why this button
165
+ download_button_hidden = gr.DownloadButton(
166
+ visible=False, elem_id="download_btn_hidden"
167
+ )
168
+ download_status = gr.Markdown()
169
 
170
  parse_button.click(
171
  fn=parse_document,
172
  inputs=[
173
  file_output,
174
+ ocr_engine,
175
  code_understanding,
176
  formula_enrichment,
 
 
177
  ],
178
  outputs=[doc, status],
179
  )
180
  html_button.click(
181
  fn=to_html,
182
  inputs=doc,
183
+ outputs=[output, file_extension],
184
  )
185
  markdown_button.click(
186
  fn=to_markdown,
187
  inputs=doc,
188
+ outputs=[output, file_extension],
189
  )
190
  json_button.click(
191
  fn=to_json,
192
  inputs=doc,
193
+ outputs=[output, file_extension],
194
  )
195
  text_button.click(
196
  fn=to_text,
197
  inputs=doc,
198
+ outputs=[output, file_extension],
199
+ )
200
+ download_button.click(
201
+ fn=download_file,
202
+ inputs=[output, file_extension],
203
+ outputs=[download_button_hidden, download_status],
204
+ ).then(
205
+ fn=None,
206
+ inputs=None,
207
+ outputs=None,
208
+ js="() => document.querySelector('#download_btn_hidden').click()",
209
  )
210
 
211
  demo.launch()