Spaces:
Sleeping
Sleeping
File size: 6,124 Bytes
27722f3 67baccc 27722f3 a370b95 27722f3 67baccc 27722f3 67baccc f1d63ad 27722f3 f1d63ad 27722f3 f1d63ad 7161f9e f1d63ad 67baccc f1d63ad 27722f3 99c8f7d 67baccc f1d63ad 67baccc f1d63ad 67baccc f1d63ad 67baccc f1d63ad 67baccc f1d63ad 27722f3 3860890 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 |
from pathlib import Path
from typing import Dict, List, Optional, Any, Union
import json
import os
import shutil
from src.parsers.parser_interface import DocumentParser
from src.parsers.parser_registry import ParserRegistry
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import (
AcceleratorDevice,
AcceleratorOptions,
PdfPipelineOptions,
)
from docling.models.tesseract_ocr_model import TesseractOcrOptions
from docling.models.tesseract_ocr_cli_model import TesseractCliOcrOptions
class DoclingParser(DocumentParser):
    """Parser implementation using Docling.

    Converts a document with Docling's ``DocumentConverter`` and returns its
    content as a string. The OCR backend is chosen with ``ocr_method`` (see
    ``get_supported_ocr_methods``) and the output representation with the
    ``output_format`` keyword option.
    """

    # File suffixes that the full-force OCR path treats as image inputs.
    _IMAGE_EXTENSIONS = (".jpg", ".jpeg", ".png", ".tiff", ".tif", ".bmp")

    @classmethod
    def get_name(cls) -> str:
        """Return the display name of this parser."""
        return "Docling"

    @classmethod
    def get_supported_ocr_methods(cls) -> List[Dict[str, Any]]:
        """Return the OCR methods this parser understands.

        Each entry is a dict with an ``id`` (the value accepted by
        ``parse(ocr_method=...)``), a human-readable ``name`` and the
        ``default_params`` associated with that method.
        """
        return [
            {
                "id": "no_ocr",
                "name": "No OCR",
                "default_params": {}
            },
            {
                "id": "easyocr",
                "name": "EasyOCR",
                "default_params": {"languages": ["en"]}
            },
            {
                "id": "easyocr_cpu",
                "name": "EasyOCR (CPU only)",
                "default_params": {"languages": ["en"], "use_gpu": False}
            },
            {
                "id": "tesseract",
                "name": "Tesseract",
                "default_params": {}
            },
            {
                "id": "tesseract_cli",
                "name": "Tesseract CLI",
                "default_params": {}
            },
            {
                "id": "full_force_ocr",
                "name": "Full Force OCR",
                "default_params": {}
            }
        ]

    def parse(self, file_path: Union[str, Path], ocr_method: Optional[str] = None, **kwargs) -> str:
        """Parse a document using Docling.

        Args:
            file_path: Path of the document to convert.
            ocr_method: One of the ids from ``get_supported_ocr_methods``.
                ``None`` or an unrecognised id leaves Docling's default OCR
                settings in place.
            **kwargs: Optional settings. ``languages`` (list of language
                codes, default ``["en"]``) is used by the EasyOCR methods;
                ``output_format`` selects ``"markdown"`` (default),
                ``"json"``, ``"text"`` or ``"document_tags"``.

        Returns:
            The converted document rendered in the requested output format.
        """
        # Special case for full force OCR. Forward the caller's options so
        # that ``output_format`` is honoured on this path as well.
        if ocr_method == "full_force_ocr":
            return self._apply_full_force_ocr(file_path, **kwargs)

        # Regular Docling parsing
        pipeline_options = PdfPipelineOptions()
        pipeline_options.do_table_structure = True
        pipeline_options.table_structure_options.do_cell_matching = True

        # ``or`` also covers an explicit ``languages=None``.
        languages = kwargs.get("languages") or ["en"]

        # Configure OCR based on the method
        if ocr_method == "no_ocr":
            pipeline_options.do_ocr = False
        elif ocr_method == "easyocr":
            pipeline_options.do_ocr = True
            pipeline_options.ocr_options.lang = languages
            pipeline_options.accelerator_options = AcceleratorOptions(
                num_threads=4, device=AcceleratorDevice.AUTO
            )
        elif ocr_method == "easyocr_cpu":
            pipeline_options.do_ocr = True
            pipeline_options.ocr_options.lang = languages
            pipeline_options.ocr_options.use_gpu = False
        elif ocr_method == "tesseract":
            pipeline_options.do_ocr = True
            pipeline_options.ocr_options = TesseractOcrOptions()
        elif ocr_method == "tesseract_cli":
            pipeline_options.do_ocr = True
            pipeline_options.ocr_options = TesseractCliOcrOptions()

        # Create the converter
        converter = DocumentConverter(
            format_options={
                InputFormat.PDF: PdfFormatOption(
                    pipeline_options=pipeline_options
                )
            }
        )

        # Convert the document
        result = converter.convert(Path(file_path))

        # Return the content in the requested format
        return self._export_document(result.document, kwargs.get("output_format"))

    @staticmethod
    def _export_document(doc: Any, output_format: Optional[str] = None) -> str:
        """Render a converted Docling document as a string.

        Args:
            doc: The ``document`` attribute of a Docling conversion result.
            output_format: ``"json"``, ``"text"``, ``"document_tags"`` or
                ``"markdown"`` (case-insensitive). ``None``, an empty value
                or any unrecognised value falls back to markdown.

        Returns:
            The document content in the selected format.
        """
        # Normalise defensively: callers may pass None explicitly.
        fmt = str(output_format or "markdown").lower()
        if fmt == "json":
            return json.dumps(doc.export_to_dict(), ensure_ascii=False, indent=2)
        if fmt == "text":
            return doc.export_to_text()
        if fmt == "document_tags":
            return doc.export_to_document_tokens()
        return doc.export_to_markdown()

    def _apply_full_force_ocr(self, file_path: Union[str, Path], **kwargs) -> str:
        """Apply full force OCR to a document.

        Runs the Tesseract CLI OCR options with ``force_full_page_ocr=True``.
        If conversion or export raises, falls back to the regular
        ``tesseract_cli`` method of ``parse``.

        Args:
            file_path: Path of the document (PDF or image) to convert.
            **kwargs: Same optional settings as ``parse``; ``output_format``
                selects the returned representation (default markdown).

        Returns:
            The converted document rendered in the requested output format.
        """
        input_doc = Path(file_path)
        file_extension = input_doc.suffix.lower()

        # Debug information
        print(f"Applying full force OCR to file: {input_doc} (type: {file_extension})")

        # Basic pipeline setup
        pipeline_options = PdfPipelineOptions()
        pipeline_options.do_ocr = True
        pipeline_options.do_table_structure = True
        pipeline_options.table_structure_options.do_cell_matching = True

        # Locate the tesseract executable. This is reported for diagnostics
        # only; it is not passed on to the OCR options.
        tesseract_path = shutil.which("tesseract") or "/usr/bin/tesseract"
        print(f"Using tesseract at: {tesseract_path}")

        # Configure OCR options: Tesseract CLI with full-page OCR forced on.
        pipeline_options.ocr_options = TesseractCliOcrOptions(force_full_page_ocr=True)

        # Set up format options based on file type
        format_options = {
            InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
        }

        # Handle image files: reuse the same pipeline options for images.
        if file_extension in self._IMAGE_EXTENSIONS:
            print(f"Processing as image file: {file_extension}")
            format_options[InputFormat.IMAGE] = PdfFormatOption(pipeline_options=pipeline_options)

        # Try full force OCR first. The broad except is deliberate: any
        # failure triggers a best-effort retry with the plain CLI method.
        try:
            converter = DocumentConverter(format_options=format_options)
            result = converter.convert(input_doc)
            return self._export_document(result.document, kwargs.get("output_format"))
        except Exception as e:
            print(f"Error with standard OCR: {e}")
            print("Attempting fallback to tesseract_cli OCR...")
            # Keep the caller's options (e.g. output_format) on the fallback.
            return self.parse(file_path, ocr_method="tesseract_cli", **kwargs)
# Register the parser with the registry at import time so it can be looked up
# through ParserRegistry.
ParserRegistry.register(DoclingParser)