# Markit / src / parsers / docling_parser.py
# AnseMin -- removing ocrmac option (8af1511)
from pathlib import Path
from typing import Dict, List, Optional, Any, Union
import json
import os
import shutil
from src.parsers.parser_interface import DocumentParser
from src.parsers.parser_registry import ParserRegistry
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import (
AcceleratorDevice,
AcceleratorOptions,
PdfPipelineOptions,
)
from docling.models.tesseract_ocr_model import TesseractOcrOptions
from docling.models.tesseract_ocr_cli_model import TesseractCliOcrOptions
class DoclingParser(DocumentParser):
    """Document parser backed by Docling's ``DocumentConverter``.

    The OCR engine is chosen per call through ``ocr_method`` (see
    :meth:`get_supported_ocr_methods` for the accepted ids) and the result
    can be exported as Markdown, JSON, plain text or document tags through
    the ``output_format`` keyword argument.
    """

    # File suffixes that additionally get an image format option in
    # full-force OCR mode.
    _IMAGE_SUFFIXES = frozenset(
        {".jpg", ".jpeg", ".png", ".tiff", ".tif", ".bmp"}
    )

    @classmethod
    def get_name(cls) -> str:
        """Return the display name of this parser."""
        return "Docling"

    @classmethod
    def get_supported_ocr_methods(cls) -> List[Dict[str, Any]]:
        """Return the OCR methods this parser accepts.

        Each entry holds the method ``id`` (the value expected by
        :meth:`parse` as ``ocr_method``), a human readable ``name`` and the
        ``default_params`` associated with that method.
        """
        return [
            {
                "id": "no_ocr",
                "name": "No OCR",
                "default_params": {},
            },
            {
                "id": "easyocr",
                "name": "EasyOCR",
                "default_params": {"languages": ["en"]},
            },
            {
                "id": "easyocr_cpu",
                "name": "EasyOCR (CPU only)",
                "default_params": {"languages": ["en"], "use_gpu": False},
            },
            {
                "id": "tesseract",
                "name": "Tesseract",
                "default_params": {},
            },
            {
                "id": "tesseract_cli",
                "name": "Tesseract CLI",
                "default_params": {},
            },
            {
                "id": "full_force_ocr",
                "name": "Full Force OCR",
                "default_params": {},
            },
        ]

    def parse(self, file_path: Union[str, Path], ocr_method: Optional[str] = None, **kwargs) -> str:
        """Parse a document using Docling.

        Args:
            file_path: Path of the document to convert.
            ocr_method: One of the ids from
                :meth:`get_supported_ocr_methods`. Any other value
                (including ``None``) leaves the OCR settings of a fresh
                ``PdfPipelineOptions`` untouched.
            **kwargs: Optional settings. ``languages`` (list of language
                codes, default ``["en"]``) is used by the EasyOCR methods;
                ``output_format`` selects the export format: ``"json"``,
                ``"text"``, ``"document_tags"`` or, for anything else,
                Markdown. The format name is matched case-insensitively.

        Returns:
            The converted document serialised in the requested format.
        """
        # Full-force OCR builds its own pipeline. Forward kwargs so that
        # ``output_format`` is honoured on that path as well.
        if ocr_method == "full_force_ocr":
            return self._apply_full_force_ocr(file_path, **kwargs)

        # Regular Docling parsing
        pipeline_options = PdfPipelineOptions()
        pipeline_options.do_table_structure = True
        pipeline_options.table_structure_options.do_cell_matching = True

        # Configure OCR based on the method
        if ocr_method == "no_ocr":
            pipeline_options.do_ocr = False
        elif ocr_method == "easyocr":
            pipeline_options.do_ocr = True
            # NOTE(review): relies on the default ``ocr_options`` being the
            # EasyOCR options object -- confirm against the Docling version.
            pipeline_options.ocr_options.lang = kwargs.get("languages", ["en"])
            pipeline_options.accelerator_options = AcceleratorOptions(
                num_threads=4, device=AcceleratorDevice.AUTO
            )
        elif ocr_method == "easyocr_cpu":
            pipeline_options.do_ocr = True
            pipeline_options.ocr_options.lang = kwargs.get("languages", ["en"])
            pipeline_options.ocr_options.use_gpu = False
        elif ocr_method == "tesseract":
            pipeline_options.do_ocr = True
            pipeline_options.ocr_options = TesseractOcrOptions()
        elif ocr_method == "tesseract_cli":
            pipeline_options.do_ocr = True
            pipeline_options.ocr_options = TesseractCliOcrOptions()

        # Create the converter
        converter = DocumentConverter(
            format_options={
                InputFormat.PDF: PdfFormatOption(
                    pipeline_options=pipeline_options
                )
            }
        )

        # Convert the document and return it in the requested format
        result = converter.convert(Path(file_path))
        return self._export_document(
            result.document, kwargs.get("output_format")
        )

    @staticmethod
    def _export_document(doc: Any, output_format: Optional[str] = None) -> str:
        """Serialise a converted Docling document.

        ``output_format`` is matched case-insensitively; ``None``, an empty
        string or an unrecognised name all fall back to Markdown.
        """
        fmt = (output_format or "markdown").lower()
        if fmt == "json":
            return json.dumps(doc.export_to_dict(), ensure_ascii=False, indent=2)
        if fmt == "text":
            return doc.export_to_text()
        if fmt == "document_tags":
            return doc.export_to_document_tokens()
        return doc.export_to_markdown()

    def _apply_full_force_ocr(self, file_path: Union[str, Path], **kwargs) -> str:
        """Apply full force OCR to a document.

        Runs the Tesseract CLI engine with ``force_full_page_ocr=True``. If
        conversion or export raises, the error is printed and the document
        is re-parsed through :meth:`parse` with the plain ``tesseract_cli``
        method.

        Args:
            file_path: Path of the document or image to convert.
            **kwargs: Same optional settings as :meth:`parse`; only
                ``output_format`` is read here, the rest is forwarded to the
                fallback.

        Returns:
            The converted document serialised in the requested format
            (Markdown by default).
        """
        input_doc = Path(file_path)
        file_extension = input_doc.suffix.lower()

        # Debug information
        print(f"Applying full force OCR to file: {input_doc} (type: {file_extension})")

        # Basic pipeline setup
        pipeline_options = PdfPipelineOptions()
        pipeline_options.do_ocr = True
        pipeline_options.do_table_structure = True
        pipeline_options.table_structure_options.do_cell_matching = True

        # Find tesseract executable
        tesseract_path = shutil.which("tesseract") or "/usr/bin/tesseract"
        print(f"Using tesseract at: {tesseract_path}")

        # Configure the Tesseract CLI engine to OCR every page in full, and
        # hand it the executable resolved above so the log line is accurate.
        pipeline_options.ocr_options = TesseractCliOcrOptions(
            force_full_page_ocr=True,
            tesseract_cmd=tesseract_path,
        )

        # Set up format options based on file type
        format_options = {
            InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
        }

        # Handle image files
        if file_extension in self._IMAGE_SUFFIXES:
            print(f"Processing as image file: {file_extension}")
            format_options[InputFormat.IMAGE] = PdfFormatOption(
                pipeline_options=pipeline_options
            )

        # Deliberate best-effort: any failure (conversion or export) falls
        # back to the regular tesseract_cli path instead of propagating.
        try:
            converter = DocumentConverter(format_options=format_options)
            result = converter.convert(input_doc)
            return self._export_document(
                result.document, kwargs.get("output_format")
            )
        except Exception as e:
            print(f"Error with standard OCR: {e}")
            print("Attempting fallback to tesseract_cli OCR...")
            return self.parse(file_path, ocr_method="tesseract_cli", **kwargs)
# Register the parser with the registry. This call sits at module top level,
# so it runs as a side effect of importing this module.
ParserRegistry.register(DoclingParser)