Spaces:

Ansemin101
/

Markit

Sleeping

File size: 6,124 Bytes

from pathlib import Path
from typing import Dict, List, Optional, Any, Union
import json
import os
import shutil

from src.parsers.parser_interface import DocumentParser
from src.parsers.parser_registry import ParserRegistry
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import (
    AcceleratorDevice,
    AcceleratorOptions,
    PdfPipelineOptions,
)
from docling.models.tesseract_ocr_model import TesseractOcrOptions
from docling.models.tesseract_ocr_cli_model import TesseractCliOcrOptions


class DoclingParser(DocumentParser):
    """Parser implementation using Docling.

    The parser converts a document with Docling's ``DocumentConverter`` and
    returns its content as a string. The OCR engine is selected through the
    ``ocr_method`` argument of :meth:`parse`; the output representation is
    selected through the optional ``output_format`` keyword argument.
    """

    # File suffixes that the full-force OCR path treats as image inputs.
    _IMAGE_SUFFIXES = (".jpg", ".jpeg", ".png", ".tiff", ".tif", ".bmp")

    @classmethod
    def get_name(cls) -> str:
        """Return the display name of this parser."""
        return "Docling"

    @classmethod
    def get_supported_ocr_methods(cls) -> List[Dict[str, Any]]:
        """Return the OCR methods this parser accepts.

        Each entry is a dict with an ``id`` (the value expected as
        ``ocr_method`` by :meth:`parse`), a human-readable ``name`` and the
        ``default_params`` associated with that method.
        """
        return [
            {
                "id": "no_ocr",
                "name": "No OCR",
                "default_params": {},
            },
            {
                "id": "easyocr",
                "name": "EasyOCR",
                "default_params": {"languages": ["en"]},
            },
            {
                "id": "easyocr_cpu",
                "name": "EasyOCR (CPU only)",
                "default_params": {"languages": ["en"], "use_gpu": False},
            },
            {
                "id": "tesseract",
                "name": "Tesseract",
                "default_params": {},
            },
            {
                "id": "tesseract_cli",
                "name": "Tesseract CLI",
                "default_params": {},
            },
            {
                "id": "full_force_ocr",
                "name": "Full Force OCR",
                "default_params": {},
            },
        ]

    @staticmethod
    def _export_document(doc: Any, output_format: Optional[str]) -> str:
        """Serialize a converted Docling document in the requested format.

        Args:
            doc: The converted document (``result.document``).
            output_format: ``"json"``, ``"text"`` or ``"document_tags"``
                (case-insensitive). ``None``, an empty string or any other
                value falls back to markdown.

        Returns:
            The document content as a string.
        """
        # ``or`` also covers an explicit ``output_format=None`` from callers,
        # which would otherwise fail on ``.lower()``.
        fmt = (output_format or "markdown").lower()
        if fmt == "json":
            return json.dumps(doc.export_to_dict(), ensure_ascii=False, indent=2)
        if fmt == "text":
            return doc.export_to_text()
        if fmt == "document_tags":
            return doc.export_to_document_tokens()
        return doc.export_to_markdown()

    def parse(self, file_path: Union[str, Path], ocr_method: Optional[str] = None, **kwargs) -> str:
        """Parse a document using Docling.

        Args:
            file_path: Path of the document to convert.
            ocr_method: One of the ids from :meth:`get_supported_ocr_methods`.
                Any other value (including ``None``) leaves the OCR settings
                of a freshly created ``PdfPipelineOptions`` untouched.
            **kwargs: Optional settings. ``languages`` (list of language
                codes, default ``["en"]``) is used by the EasyOCR methods;
                ``output_format`` (``"markdown"`` by default, or ``"json"``,
                ``"text"``, ``"document_tags"``) selects the returned
                representation.

        Returns:
            The converted document as a string in the requested format.
        """
        # Full force OCR has its own pipeline setup; forward kwargs so that
        # ``output_format`` is honoured on that path as well.
        if ocr_method == "full_force_ocr":
            return self._apply_full_force_ocr(file_path, **kwargs)

        # Regular Docling parsing
        pipeline_options = PdfPipelineOptions()
        pipeline_options.do_table_structure = True
        pipeline_options.table_structure_options.do_cell_matching = True

        # Configure OCR based on the method
        if ocr_method == "no_ocr":
            pipeline_options.do_ocr = False
        elif ocr_method == "easyocr":
            pipeline_options.do_ocr = True
            pipeline_options.ocr_options.lang = kwargs.get("languages", ["en"])
            pipeline_options.accelerator_options = AcceleratorOptions(
                num_threads=4, device=AcceleratorDevice.AUTO
            )
        elif ocr_method == "easyocr_cpu":
            pipeline_options.do_ocr = True
            pipeline_options.ocr_options.lang = kwargs.get("languages", ["en"])
            pipeline_options.ocr_options.use_gpu = False
        elif ocr_method == "tesseract":
            pipeline_options.do_ocr = True
            pipeline_options.ocr_options = TesseractOcrOptions()
        elif ocr_method == "tesseract_cli":
            pipeline_options.do_ocr = True
            pipeline_options.ocr_options = TesseractCliOcrOptions()

        # Create the converter
        converter = DocumentConverter(
            format_options={
                InputFormat.PDF: PdfFormatOption(
                    pipeline_options=pipeline_options
                )
            }
        )

        # Convert the document and return it in the requested format
        result = converter.convert(Path(file_path))
        return self._export_document(result.document, kwargs.get("output_format"))

    def _apply_full_force_ocr(self, file_path: Union[str, Path], **kwargs) -> str:
        """Apply full force OCR to a document.

        Runs the Tesseract CLI engine with ``force_full_page_ocr=True``. If
        the conversion raises, the error is printed and the document is
        re-parsed through :meth:`parse` with ``ocr_method="tesseract_cli"``.

        Args:
            file_path: Path of the document (PDF or image) to convert.
            **kwargs: Optional settings; ``output_format`` selects the
                returned representation (markdown when omitted) and all
                kwargs are forwarded to the fallback :meth:`parse` call.

        Returns:
            The converted document as a string in the requested format.
        """
        input_doc = Path(file_path)
        file_extension = input_doc.suffix.lower()

        # Debug information
        print(f"Applying full force OCR to file: {input_doc} (type: {file_extension})")

        # Basic pipeline setup
        pipeline_options = PdfPipelineOptions()
        pipeline_options.do_ocr = True
        pipeline_options.do_table_structure = True
        pipeline_options.table_structure_options.do_cell_matching = True

        # Find tesseract executable
        tesseract_path = shutil.which("tesseract") or "/usr/bin/tesseract"
        print(f"Using tesseract at: {tesseract_path}")

        # Tesseract CLI with OCR forced on every full page. The resolved
        # executable is passed along so the message above reflects what runs.
        pipeline_options.ocr_options = TesseractCliOcrOptions(
            force_full_page_ocr=True,
            tesseract_cmd=tesseract_path,
        )

        # Set up format options based on file type
        format_options = {
            InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
        }

        # Image inputs get the same pipeline options registered for IMAGE.
        if file_extension in self._IMAGE_SUFFIXES:
            print(f"Processing as image file: {file_extension}")
            format_options[InputFormat.IMAGE] = PdfFormatOption(pipeline_options=pipeline_options)

        # Try full force OCR; on any failure fall back to plain tesseract_cli
        # (deliberate best-effort, so the broad catch is intentional).
        try:
            converter = DocumentConverter(format_options=format_options)
            result = converter.convert(input_doc)
            return self._export_document(result.document, kwargs.get("output_format"))
        except Exception as e:
            print(f"Error with standard OCR: {e}")
            print(f"Attempting fallback to tesseract_cli OCR...")
            return self.parse(file_path, ocr_method="tesseract_cli", **kwargs)


# Register the parser with the registry.
# This is a module-level call, so it runs as a side effect of importing this file.
ParserRegistry.register(DoclingParser)