Spaces:

Ansemin101
/

Markit

Sleeping

App Files Files Community

Markit / src /parsers /docling_parser.py

AnseMin

removing ocrmac option

8af1511 about 2 months ago

raw

history blame contribute delete

6.12 kB

	from pathlib import Path
	from typing import Dict, List, Optional, Any, Union
	import json
	import os
	import shutil

	from src.parsers.parser_interface import DocumentParser
	from src.parsers.parser_registry import ParserRegistry
	from docling.document_converter import DocumentConverter, PdfFormatOption
	from docling.datamodel.base_models import InputFormat
	from docling.datamodel.pipeline_options import (
	AcceleratorDevice,
	AcceleratorOptions,
	PdfPipelineOptions,
	)
	from docling.models.tesseract_ocr_model import TesseractOcrOptions
	from docling.models.tesseract_ocr_cli_model import TesseractCliOcrOptions


	class DoclingParser(DocumentParser):
	    """Parser implementation using Docling.

	    Documents are converted with Docling's ``DocumentConverter`` and the
	    result is exported as markdown, JSON, plain text or document tags.
	    The OCR behaviour is selected through the ``ocr_method`` ids listed by
	    ``get_supported_ocr_methods``.
	    """

	    # Suffixes for which the forced-OCR pipeline is additionally registered
	    # under InputFormat.IMAGE.
	    _IMAGE_SUFFIXES = (".jpg", ".jpeg", ".png", ".tiff", ".tif", ".bmp")

	    @classmethod
	    def get_name(cls) -> str:
	        """Return the name of this parser (``"Docling"``)."""
	        return "Docling"

	    @classmethod
	    def get_supported_ocr_methods(cls) -> List[Dict[str, Any]]:
	        """Return the OCR methods accepted by ``parse``.

	        Each entry is a dict with the keys ``id`` (the value ``parse``
	        compares ``ocr_method`` against), ``name`` and ``default_params``.
	        """
	        return [
	            {
	                "id": "no_ocr",
	                "name": "No OCR",
	                "default_params": {},
	            },
	            {
	                "id": "easyocr",
	                "name": "EasyOCR",
	                "default_params": {"languages": ["en"]},
	            },
	            {
	                "id": "easyocr_cpu",
	                "name": "EasyOCR (CPU only)",
	                "default_params": {"languages": ["en"], "use_gpu": False},
	            },
	            {
	                "id": "tesseract",
	                "name": "Tesseract",
	                "default_params": {},
	            },
	            {
	                "id": "tesseract_cli",
	                "name": "Tesseract CLI",
	                "default_params": {},
	            },
	            {
	                "id": "full_force_ocr",
	                "name": "Full Force OCR",
	                "default_params": {},
	            },
	        ]

	    @staticmethod
	    def _base_pipeline_options() -> PdfPipelineOptions:
	        """Return pipeline options with table structure and cell matching on."""
	        pipeline_options = PdfPipelineOptions()
	        pipeline_options.do_table_structure = True
	        pipeline_options.table_structure_options.do_cell_matching = True
	        return pipeline_options

	    @staticmethod
	    def _export_document(doc: Any, output_format: Optional[str]) -> str:
	        """Serialize a converted Docling document in the requested format.

	        ``output_format`` is matched case-insensitively against ``"json"``,
	        ``"text"`` and ``"document_tags"``; ``None``, an empty string or any
	        other value yields markdown.
	        """
	        fmt = (output_format or "markdown").lower()
	        if fmt == "json":
	            return json.dumps(doc.export_to_dict(), ensure_ascii=False, indent=2)
	        if fmt == "text":
	            return doc.export_to_text()
	        if fmt == "document_tags":
	            return doc.export_to_document_tokens()
	        return doc.export_to_markdown()

	    def parse(self, file_path: Union[str, Path], ocr_method: Optional[str] = None, **kwargs) -> str:
	        """Parse a document using Docling.

	        Args:
	            file_path: Path of the document to convert.
	            ocr_method: One of the ids from ``get_supported_ocr_methods``.
	                ``None`` or an unrecognised id leaves the OCR settings of a
	                fresh ``PdfPipelineOptions`` untouched.
	            **kwargs: Optional settings. ``languages`` is used by the
	                ``easyocr`` and ``easyocr_cpu`` methods (default ``["en"]``);
	                ``output_format`` selects ``"json"``, ``"text"``,
	                ``"document_tags"`` or markdown (the default).

	        Returns:
	            The converted document serialized in the requested format.
	        """
	        output_format = kwargs.get("output_format")

	        # Special case for full force OCR; the requested output format is
	        # forwarded so this method behaves like every other OCR method.
	        if ocr_method == "full_force_ocr":
	            return self._apply_full_force_ocr(file_path, output_format=output_format)

	        # Regular Docling parsing
	        pipeline_options = self._base_pipeline_options()

	        # A missing, None or empty language list falls back to English.
	        languages = kwargs.get("languages") or ["en"]

	        # Configure OCR based on the method
	        if ocr_method == "no_ocr":
	            pipeline_options.do_ocr = False
	        elif ocr_method == "easyocr":
	            pipeline_options.do_ocr = True
	            pipeline_options.ocr_options.lang = languages
	            pipeline_options.accelerator_options = AcceleratorOptions(
	                num_threads=4, device=AcceleratorDevice.AUTO
	            )
	        elif ocr_method == "easyocr_cpu":
	            pipeline_options.do_ocr = True
	            pipeline_options.ocr_options.lang = languages
	            pipeline_options.ocr_options.use_gpu = False
	        elif ocr_method == "tesseract":
	            pipeline_options.do_ocr = True
	            pipeline_options.ocr_options = TesseractOcrOptions()
	        elif ocr_method == "tesseract_cli":
	            pipeline_options.do_ocr = True
	            pipeline_options.ocr_options = TesseractCliOcrOptions()

	        # Create the converter; the options are registered for PDF input only.
	        converter = DocumentConverter(
	            format_options={
	                InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
	            }
	        )

	        # Convert the document and return it in the requested format
	        result = converter.convert(Path(file_path))
	        return self._export_document(result.document, output_format)

	    def _apply_full_force_ocr(
	        self, file_path: Union[str, Path], output_format: Optional[str] = None
	    ) -> str:
	        """Apply full force OCR to a document.

	        Runs the Tesseract CLI engine with ``force_full_page_ocr=True``. If
	        conversion or export raises, the document is re-parsed through
	        ``parse`` with the plain ``tesseract_cli`` method.

	        Args:
	            file_path: Path of the document to convert.
	            output_format: Same meaning as the ``output_format`` keyword of
	                ``parse``; ``None`` yields markdown.

	        Returns:
	            The converted document serialized in the requested format.
	        """
	        input_doc = Path(file_path)
	        file_extension = input_doc.suffix.lower()

	        # Debug information
	        print(f"Applying full force OCR to file: {input_doc} (type: {file_extension})")

	        # Basic pipeline setup
	        pipeline_options = self._base_pipeline_options()
	        pipeline_options.do_ocr = True

	        # Find tesseract executable
	        tesseract_path = shutil.which("tesseract") or "/usr/bin/tesseract"
	        print(f"Using tesseract at: {tesseract_path}")

	        # Configure the Tesseract CLI engine to OCR every page in full and to
	        # run the executable reported above.
	        pipeline_options.ocr_options = TesseractCliOcrOptions(
	            force_full_page_ocr=True,
	            tesseract_cmd=tesseract_path,
	        )

	        # Set up format options based on file type
	        format_options = {
	            InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
	        }

	        # Handle image files
	        if file_extension in self._IMAGE_SUFFIXES:
	            print(f"Processing as image file: {file_extension}")
	            format_options[InputFormat.IMAGE] = PdfFormatOption(pipeline_options=pipeline_options)

	        # Try full force OCR first. The broad except is deliberate: any
	        # failure triggers a best-effort retry without forced full-page OCR.
	        try:
	            converter = DocumentConverter(format_options=format_options)
	            result = converter.convert(input_doc)
	            return self._export_document(result.document, output_format)
	        except Exception as e:
	            print(f"Error with standard OCR: {e}")
	            print("Attempting fallback to tesseract_cli OCR...")
	            return self.parse(
	                file_path, ocr_method="tesseract_cli", output_format=output_format
	            )


	# Register the parser with the registry. This is a module-level statement,
	# so DoclingParser is handed to ParserRegistry.register when the module is
	# imported; the class itself (not an instance) is passed.
	ParserRegistry.register(DoclingParser)