Spaces:

Ansemin101
/

Markit

Sleeping

File size: 2,747 Bytes

from pathlib import Path
from typing import Dict, List, Optional, Any, Union
import json
import pypdfium2 as pdfium

from src.parsers.parser_interface import DocumentParser
from src.parsers.parser_registry import ParserRegistry
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend


class PyPdfiumParser(DocumentParser):
    """Parser implementation using PyPdfium."""
    
    @classmethod
    def get_name(cls) -> str:
        return "PyPdfium"
    
    @classmethod
    def get_supported_ocr_methods(cls) -> List[Dict[str, Any]]:
        return [
            {
                "id": "no_ocr",
                "name": "No OCR",
                "default_params": {}
            },
            {
                "id": "easyocr",
                "name": "EasyOCR",
                "default_params": {"languages": ["en"]}
            }
        ]
    
    def parse(self, file_path: Union[str, Path], ocr_method: Optional[str] = None, **kwargs) -> str:
        """Parse a document using PyPdfium."""
        pipeline_options = PdfPipelineOptions()
        pipeline_options.do_table_structure = True
        pipeline_options.table_structure_options.do_cell_matching = True
        
        # Configure OCR based on the method
        if ocr_method == "easyocr":
            pipeline_options.do_ocr = True
            # Apply any custom parameters from kwargs
            if "languages" in kwargs:
                pipeline_options.ocr_options.lang = kwargs["languages"]
        else:
            pipeline_options.do_ocr = False
        
        # Create the converter
        converter = DocumentConverter(
            format_options={
                InputFormat.PDF: PdfFormatOption(
                    pipeline_options=pipeline_options,
                    backend=PyPdfiumDocumentBackend
                )
            }
        )
        
        # Convert the document
        result = converter.convert(Path(file_path))
        doc = result.document
        
        # Return the content in the requested format
        output_format = kwargs.get("output_format", "markdown")
        if output_format.lower() == "json":
            return json.dumps(doc.export_to_dict(), ensure_ascii=False, indent=2)
        elif output_format.lower() == "text":
            return doc.export_to_text()
        elif output_format.lower() == "document_tags":
            return doc.export_to_document_tokens()
        else:
            return doc.export_to_markdown()


# Register the parser with the registry
ParserRegistry.register(PyPdfiumParser)