Spaces:

mozilla-ai
/

document-to-markdown

Running

App Files Files Community

github-actions[bot] commited on 13 days ago

Commit

0cfb559

1 Parent(s): fa4d37c

Sync with https://github.com/mozilla-ai/document-to-markdown

Browse files

Files changed (1) hide show

app.py +88 -52

app.py CHANGED Viewed

@@ -1,50 +1,49 @@
 from typing import Dict, Tuple
 import os
 import gradio as gr
-import torch.cuda
 from docling.datamodel.base_models import InputFormat
-from docling.datamodel.pipeline_options import PdfPipelineOptions, AcceleratorDevice
 from docling.document_converter import DocumentConverter, PdfFormatOption
 from docling_core.types import DoclingDocument
 from docling.utils import model_downloader
-from docling.datamodel.pipeline_options import smolvlm_picture_description
 # Download models upon HF space initialization
-pipeline_options = PdfPipelineOptions()
-if torch.cuda.is_available():
-    print("Enabling CUDA Accelerator")
-    pipeline_options.accelerator_options.device = AcceleratorDevice.CUDA
-    pipeline_options.accelerator_options.cuda_use_flash_attention2 = True
 if os.getenv("IS_HF_SPACE"):
     print("Downloading models...")
     model_downloader.download_models()
 def parse_document(
     file_path: str,
     do_code_enrichment: bool,
     do_formula_enrichment: bool,
-    do_picture_classification: bool,
-    do_picture_description: bool,
 ) -> Tuple[DoclingDocument, str]:
     yield None, f"Parsing document... ⏳"
-    pipeline_options.do_code_enrichment = do_code_enrichment
-    pipeline_options.do_formula_enrichment = do_formula_enrichment
-    pipeline_options.generate_picture_images = do_picture_classification
-    pipeline_options.images_scale = 2
-    pipeline_options.do_picture_classification = do_picture_classification
-    pipeline_options.do_picture_description = do_picture_description
-    pipeline_options.picture_description_options = smolvlm_picture_description
-    pipeline_options.picture_description_options.prompt = "Describe the image in three sentences. Be concise and accurate."
-    pipeline_options.images_scale = 2.0
-    pipeline_options.generate_picture_images = True
-    print(f"Pipeline options defined: \n\t{pipeline_options}")
     converter = DocumentConverter(
         format_options={
-            InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
         }
     )
@@ -53,20 +52,31 @@ def parse_document(
     yield result.document, "Done ✅"
-def to_html(docling_doc: DoclingDocument) -> str:
-    return docling_doc.export_to_html()
-def to_markdown(docling_doc: DoclingDocument) -> str:
-    return docling_doc.export_to_markdown()
-def to_json(docling_doc: DoclingDocument) -> Dict:
-    return docling_doc.export_to_dict()
-def to_text(docling_doc: DoclingDocument) -> str:
-    return docling_doc.export_to_text()
 def upload_file(file) -> str:
@@ -80,10 +90,6 @@ def setup_gradio_demo():
             Docling is very powerful tool, with lots of cool features and integrations to other AI frameworks (e.g. LlamaIndex, LangChain, and many more).
-            Model used for picture classification: [EfficientNet-B0 Document Image Classifier](https://huggingface.co/ds4sd/DocumentFigureClassifier)
-            Model used for picture description: [SmolVLM-256M-Instruct](https://huggingface.co/HuggingFaceTB/SmolVLM-256M-Instruct)
             To explore the full set of features of Docling visit: https://github.com/docling-project/docling
             """
         )
@@ -110,22 +116,18 @@ def setup_gradio_demo():
                 )
             with gr.Column():
-                gr.Markdown("### 2) Configure engine & Parse")
                 code_understanding = gr.Checkbox(
                     value=False, label="Enable Code understanding"
                 )
                 formula_enrichment = gr.Checkbox(
                     value=False, label="Enable Formula understanding"
                 )
-                picture_classification = gr.Checkbox(
-                    value=False, label="Enable Picture classification"
-                )
-                picture_description = gr.Checkbox(
-                    value=False, label="Enable Picture description"
-                )
-                gr.Markdown(
-                    "_**Warning:** Enabling any of these features can potentially increase the processing time._"
-                )
                 parse_button = gr.Button("Parse document")
                 status = gr.Markdown()
@@ -136,40 +138,74 @@ def setup_gradio_demo():
                 markdown_button = gr.Button("Convert to markdown")
                 json_button = gr.Button("Convert to JSON")
                 text_button = gr.Button("Convert to text")
         doc = gr.State()
-        output = gr.Text(label="Output")
         parse_button.click(
             fn=parse_document,
             inputs=[
                 file_output,
                 code_understanding,
                 formula_enrichment,
-                picture_classification,
-                picture_description,
             ],
             outputs=[doc, status],
         )
         html_button.click(
             fn=to_html,
             inputs=doc,
-            outputs=output,
         )
         markdown_button.click(
             fn=to_markdown,
             inputs=doc,
-            outputs=output,
         )
         json_button.click(
             fn=to_json,
             inputs=doc,
-            outputs=output,
         )
         text_button.click(
             fn=to_text,
             inputs=doc,
-            outputs=output,
         )
     demo.launch()

+import json
 from typing import Dict, Tuple
 import os
 import gradio as gr
 from docling.datamodel.base_models import InputFormat
+from docling.datamodel.pipeline_options import (
+    PdfPipelineOptions,
+    EasyOcrOptions,
+    TesseractOcrOptions,
+    RapidOcrOptions,
+    OcrMacOptions,
+)
 from docling.document_converter import DocumentConverter, PdfFormatOption
 from docling_core.types import DoclingDocument
 from docling.utils import model_downloader
 # Download models upon HF space initialization
 if os.getenv("IS_HF_SPACE"):
     print("Downloading models...")
     model_downloader.download_models()
+engines_available = {
+    "EasyOCR (Default)": EasyOcrOptions(),
+    "Tesseract": TesseractOcrOptions(),
+    "RapidOCR": RapidOcrOptions(),
+    "OcrMac (Mac only)": OcrMacOptions(),
+}
 def parse_document(
     file_path: str,
+    engine: str,
     do_code_enrichment: bool,
     do_formula_enrichment: bool,
 ) -> Tuple[DoclingDocument, str]:
     yield None, f"Parsing document... ⏳"
+    pdf_pipeline_options = PdfPipelineOptions()
+    pdf_pipeline_options.ocr_options = engines_available[engine]
+    pdf_pipeline_options.do_code_enrichment = do_code_enrichment
+    pdf_pipeline_options.do_formula_enrichment = do_formula_enrichment
+    print(f"PDF Pipeline options defined: \n\t{pdf_pipeline_options}")
     converter = DocumentConverter(
         format_options={
+            InputFormat.PDF: PdfFormatOption(pipeline_options=pdf_pipeline_options)
         }
     )
     yield result.document, "Done ✅"
+def to_html(docling_doc: DoclingDocument) -> Tuple[str, str]:
+    return docling_doc.export_to_html(), "html"
+def to_markdown(docling_doc: DoclingDocument) -> Tuple[str, str]:
+    return docling_doc.export_to_markdown(), "md"
+def to_json(docling_doc: DoclingDocument) -> Tuple[Dict, str]:
+    return docling_doc.export_to_dict(), "json"
+def to_text(docling_doc: DoclingDocument) -> Tuple[str, str]:
+    return docling_doc.export_to_text(), "txt"
+def download_file(doc: str | Dict, file_extension: str):
+    final_filename = f"doc.{file_extension}"
+    if file_extension == "json":
+        with open(final_filename, "w") as json_file:
+            json.dump(doc, json_file, indent=4)
+    else:
+        with open(final_filename, "w") as file:
+            file.write(doc)
+    return [final_filename, "Downloaded ✅"]
 def upload_file(file) -> str:
             Docling is very powerful tool, with lots of cool features and integrations to other AI frameworks (e.g. LlamaIndex, LangChain, and many more).
             To explore the full set of features of Docling visit: https://github.com/docling-project/docling
             """
         )
                 )
             with gr.Column():
+                gr.Markdown("### 2) Configure engine (Only applicable for PDF files)")
+                ocr_engine = gr.Dropdown(
+                    choices=list(engines_available.keys()), label="Select OCR engine"
+                )
                 code_understanding = gr.Checkbox(
                     value=False, label="Enable Code understanding"
                 )
                 formula_enrichment = gr.Checkbox(
                     value=False, label="Enable Formula understanding"
                 )
                 parse_button = gr.Button("Parse document")
                 status = gr.Markdown()
                 markdown_button = gr.Button("Convert to markdown")
                 json_button = gr.Button("Convert to JSON")
                 text_button = gr.Button("Convert to text")
+                file_extension = gr.Text(visible=False)
         doc = gr.State()
+        with gr.Column():
+            with gr.Group():
+                output = gr.Textbox(
+                    label="Output",
+                    lines=10,
+                    interactive=False,
+                    elem_id="output-textbox",
+                )
+                gr.HTML(
+                    """
+                    <div style="display: flex; flex-direction: column; align-items: center;">
+                      <button id="copy-button" onclick="const text = document.getElementById('output-textbox').querySelector('textarea').value; navigator.clipboard.writeText(text); const copiedMsg = document.getElementById('copied-msg'); copiedMsg.style.display = 'inline'; setTimeout(() => copiedMsg.style.display = 'none', 1500);" style="margin-top: 10px;">
+                        📋 Copy output to clipboard
+                      </button>
+                      <span id="copied-msg" style="margin-left: 10px; color: green; display: none;">Copied!</span>
+                    </div>
+                    """
+                )
+        download_button = gr.Button("Download to file")
+        # See https://github.com/gradio-app/gradio/issues/9230#issuecomment-2323771634 why this button
+        download_button_hidden = gr.DownloadButton(
+            visible=False, elem_id="download_btn_hidden"
+        )
+        download_status = gr.Markdown()
         parse_button.click(
             fn=parse_document,
             inputs=[
                 file_output,
+                ocr_engine,
                 code_understanding,
                 formula_enrichment,
             ],
             outputs=[doc, status],
         )
         html_button.click(
             fn=to_html,
             inputs=doc,
+            outputs=[output, file_extension],
         )
         markdown_button.click(
             fn=to_markdown,
             inputs=doc,
+            outputs=[output, file_extension],
         )
         json_button.click(
             fn=to_json,
             inputs=doc,
+            outputs=[output, file_extension],
         )
         text_button.click(
             fn=to_text,
             inputs=doc,
+            outputs=[output, file_extension],
+        )
+        download_button.click(
+            fn=download_file,
+            inputs=[output, file_extension],
+            outputs=[download_button_hidden, download_status],
+        ).then(
+            fn=None,
+            inputs=None,
+            outputs=None,
+            js="() => document.querySelector('#download_btn_hidden').click()",
         )
     demo.launch()