File size: 1,974 Bytes
27722f3
 
a370b95
 
 
27722f3
 
a370b95
 
27722f3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
from pathlib import Path
from typing import Dict, List, Optional, Any, Union
import subprocess
import tempfile
import os
import json

from src.parsers.parser_interface import DocumentParser
from src.parsers.parser_registry import ParserRegistry
from marker.converters.pdf import PdfConverter
from marker.models import create_model_dict
from marker.output import text_from_rendered


class MarkerParser(DocumentParser):
    """Parser implementation using Marker.

    Runs Marker's ``PdfConverter`` on a file and returns the extracted text,
    optionally re-wrapped as JSON, stripped "plain" text, or ``<doc>`` tags.
    """

    # Deletion table for the "text" output format: removes every '#', '*' and
    # '_' character in a single pass. This is a blunt character strip, not a
    # Markdown-aware conversion, so literal underscores/asterisks in the
    # document body are removed as well.
    _MARKUP_STRIP_TABLE = str.maketrans("", "", "#*_")

    @classmethod
    def get_name(cls) -> str:
        """Return the display name of this parser."""
        return "Marker"

    @classmethod
    def get_supported_ocr_methods(cls) -> List[Dict[str, Any]]:
        """Return the OCR modes this parser advertises.

        Each entry is a dict with an ``id``, a human-readable ``name`` and a
        ``default_params`` dict (empty for both modes).
        """
        return [
            {
                "id": "no_ocr",
                "name": "No OCR",
                "default_params": {},
            },
            {
                "id": "force_ocr",
                "name": "Force OCR",
                "default_params": {},
            },
        ]

    def parse(self, file_path: Union[str, Path], ocr_method: Optional[str] = None, **kwargs) -> str:
        """Parse a document using Marker.

        Args:
            file_path: Path of the document; converted to ``str`` before being
                handed to the converter.
            ocr_method: ``"force_ocr"`` sets Marker's ``force_ocr`` config flag
                to ``True``; any other value (including ``None``) leaves it
                ``False``.
            **kwargs: Only ``output_format`` is read. Recognised values
                (case-insensitive) are ``"json"``, ``"text"`` and
                ``"document_tags"``; anything else, a missing key, or ``None``
                returns Marker's text unchanged.

        Returns:
            The extracted content, formatted per ``output_format``.
        """
        force_ocr = ocr_method == "force_ocr"

        # NOTE(review): create_model_dict() is invoked on every call; if it
        # loads model weights each time, caching it may be worthwhile — confirm.
        converter = PdfConverter(
            artifact_dict=create_model_dict(),
            config={"force_ocr": force_ocr},
        )
        rendered = converter(str(file_path))
        content, _, _ = text_from_rendered(rendered)

        # An explicit output_format=None bypasses dict.get's default, so fall
        # back to "markdown" here instead of failing on None.lower().
        output_format = (kwargs.get("output_format") or "markdown").lower()

        if output_format == "json":
            return json.dumps({"content": content}, ensure_ascii=False, indent=2)
        if output_format == "text":
            return content.translate(self._MARKUP_STRIP_TABLE)
        if output_format == "document_tags":
            return f"<doc>\n{content}\n</doc>"
        return content


# Register the parser with the registry.
# This is a module-level statement, so it runs as a side effect of importing
# this module: the MarkerParser class itself (not an instance) is passed to
# ParserRegistry.register.
ParserRegistry.register(MarkerParser)