File size: 6,124 Bytes
27722f3
 
 
67baccc
 
27722f3
a370b95
 
27722f3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
67baccc
27722f3
67baccc
 
 
f1d63ad
27722f3
 
 
 
 
f1d63ad
 
 
27722f3
f1d63ad
7161f9e
f1d63ad
67baccc
f1d63ad
 
 
 
27722f3
99c8f7d
67baccc
 
f1d63ad
67baccc
f1d63ad
67baccc
f1d63ad
67baccc
f1d63ad
67baccc
f1d63ad
 
 
27722f3
 
 
3860890
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
from pathlib import Path
from typing import Dict, List, Optional, Any, Union
import json
import os
import shutil

from src.parsers.parser_interface import DocumentParser
from src.parsers.parser_registry import ParserRegistry
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import (
    AcceleratorDevice,
    AcceleratorOptions,
    PdfPipelineOptions,
)
from docling.models.tesseract_ocr_model import TesseractOcrOptions
from docling.models.tesseract_ocr_cli_model import TesseractCliOcrOptions


class DoclingParser(DocumentParser):
    """Parser implementation using Docling.

    Converts a document with Docling's ``DocumentConverter`` and returns the
    result as a string. The OCR engine is selected through ``ocr_method`` (see
    ``get_supported_ocr_methods``) and the serialization through the
    ``output_format`` keyword argument of ``parse``.
    """

    @classmethod
    def get_name(cls) -> str:
        """Return the display name of this parser."""
        return "Docling"

    @classmethod
    def get_supported_ocr_methods(cls) -> List[Dict[str, Any]]:
        """Return the OCR methods accepted by ``parse``.

        Each entry is a dict with an ``id`` (the value to pass as
        ``ocr_method``), a human-readable ``name`` and ``default_params``.
        """
        return [
            {
                "id": "no_ocr",
                "name": "No OCR",
                "default_params": {}
            },
            {
                "id": "easyocr",
                "name": "EasyOCR",
                "default_params": {"languages": ["en"]}
            },
            {
                "id": "easyocr_cpu",
                "name": "EasyOCR (CPU only)",
                "default_params": {"languages": ["en"], "use_gpu": False}
            },
            {
                "id": "tesseract",
                "name": "Tesseract",
                "default_params": {}
            },
            {
                "id": "tesseract_cli",
                "name": "Tesseract CLI",
                "default_params": {}
            },
            {
                "id": "full_force_ocr",
                "name": "Full Force OCR",
                "default_params": {}
            }
        ]

    def parse(self, file_path: Union[str, Path], ocr_method: Optional[str] = None, **kwargs) -> str:
        """Parse a document using Docling.

        Args:
            file_path: Path of the document to convert.
            ocr_method: One of the ids from ``get_supported_ocr_methods``.
                Any other value (including ``None``) leaves Docling's default
                OCR settings untouched.
            **kwargs: Optional settings. ``languages`` (list of language
                codes) is used by the EasyOCR methods; ``output_format``
                selects the serialization: ``"json"``, ``"text"``,
                ``"document_tags"`` or anything else / ``None`` for markdown.

        Returns:
            The converted document serialized in the requested format.
        """
        output_format = kwargs.get("output_format")

        # Full force OCR builds its own pipeline; forward the requested
        # output format so this path honors it like every other method.
        if ocr_method == "full_force_ocr":
            return self._apply_full_force_ocr(file_path, output_format=output_format)

        # Regular Docling parsing
        pipeline_options = PdfPipelineOptions()
        pipeline_options.do_table_structure = True
        pipeline_options.table_structure_options.do_cell_matching = True

        # Configure OCR based on the method.
        # NOTE(review): the two EasyOCR branches mutate the default
        # ``ocr_options`` object and therefore assume it is the EasyOCR
        # options type -- confirm against the installed Docling version.
        if ocr_method == "no_ocr":
            pipeline_options.do_ocr = False
        elif ocr_method == "easyocr":
            pipeline_options.do_ocr = True
            pipeline_options.ocr_options.lang = kwargs.get("languages", ["en"])
            pipeline_options.accelerator_options = AcceleratorOptions(
                num_threads=4, device=AcceleratorDevice.AUTO
            )
        elif ocr_method == "easyocr_cpu":
            pipeline_options.do_ocr = True
            pipeline_options.ocr_options.lang = kwargs.get("languages", ["en"])
            pipeline_options.ocr_options.use_gpu = False
        elif ocr_method == "tesseract":
            pipeline_options.do_ocr = True
            pipeline_options.ocr_options = TesseractOcrOptions()
        elif ocr_method == "tesseract_cli":
            pipeline_options.do_ocr = True
            pipeline_options.ocr_options = TesseractCliOcrOptions()

        # Create the converter
        converter = DocumentConverter(
            format_options={
                InputFormat.PDF: PdfFormatOption(
                    pipeline_options=pipeline_options
                )
            }
        )

        # Convert the document and serialize it in the requested format
        result = converter.convert(Path(file_path))
        return self._export_document(result.document, output_format)

    @staticmethod
    def _export_document(doc: Any, output_format: Optional[str] = None) -> str:
        """Serialize a converted Docling document.

        Args:
            doc: The ``document`` attribute of a Docling conversion result.
            output_format: ``"json"``, ``"text"`` or ``"document_tags"``
                (case-insensitive). ``None``, an empty string or any other
                value selects markdown.

        Returns:
            The serialized document.
        """
        # ``or`` guards against an explicit ``output_format=None``, which
        # would otherwise fail on ``.lower()``.
        fmt = (output_format or "markdown").lower()
        if fmt == "json":
            return json.dumps(doc.export_to_dict(), ensure_ascii=False, indent=2)
        if fmt == "text":
            return doc.export_to_text()
        if fmt == "document_tags":
            return doc.export_to_document_tokens()
        return doc.export_to_markdown()

    def _apply_full_force_ocr(
        self,
        file_path: Union[str, Path],
        output_format: Optional[str] = None,
    ) -> str:
        """Apply full force OCR to a document.

        Runs the Tesseract CLI engine with ``force_full_page_ocr=True``. If
        conversion raises, falls back to ``parse`` with the plain
        ``tesseract_cli`` method.

        Args:
            file_path: Path of the document or image to convert.
            output_format: Serialization to return; see ``parse``. Defaults
                to markdown.

        Returns:
            The converted document serialized in the requested format.
        """
        input_doc = Path(file_path)
        file_extension = input_doc.suffix.lower()

        # Debug information
        print(f"Applying full force OCR to file: {input_doc} (type: {file_extension})")

        # Basic pipeline setup
        pipeline_options = PdfPipelineOptions()
        pipeline_options.do_ocr = True
        pipeline_options.do_table_structure = True
        pipeline_options.table_structure_options.do_cell_matching = True

        # Find tesseract executable
        tesseract_path = shutil.which("tesseract") or "/usr/bin/tesseract"
        print(f"Using tesseract at: {tesseract_path}")

        # Configure OCR options: Tesseract CLI engine, OCR forced on every
        # full page. The resolved executable is passed on so the path logged
        # above is the one actually invoked.
        pipeline_options.ocr_options = TesseractCliOcrOptions(
            force_full_page_ocr=True,
            tesseract_cmd=tesseract_path,
        )

        # Set up format options based on file type
        format_options = {
            InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
        }

        # Handle image files: register the same pipeline for image inputs
        if file_extension in ['.jpg', '.jpeg', '.png', '.tiff', '.tif', '.bmp']:
            print(f"Processing as image file: {file_extension}")
            format_options[InputFormat.IMAGE] = PdfFormatOption(pipeline_options=pipeline_options)

        # Try full force OCR; on any failure retry through the regular
        # tesseract_cli path (deliberate best-effort, hence the broad catch).
        try:
            converter = DocumentConverter(format_options=format_options)
            result = converter.convert(input_doc)
            return self._export_document(result.document, output_format)
        except Exception as e:
            print(f"Error with standard OCR: {e}")
            print("Attempting fallback to tesseract_cli OCR...")
            return self.parse(
                file_path,
                ocr_method="tesseract_cli",
                output_format=output_format,
            )


# Register the parser with the registry.
# This call runs at import time, so importing this module is what hands
# DoclingParser to ParserRegistry.
ParserRegistry.register(DoclingParser)