Spaces:
Sleeping
Sleeping
File size: 1,974 Bytes
27722f3 a370b95 27722f3 a370b95 27722f3 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 |
from pathlib import Path
from typing import Dict, List, Optional, Any, Union
import subprocess
import tempfile
import os
import json
from src.parsers.parser_interface import DocumentParser
from src.parsers.parser_registry import ParserRegistry
from marker.converters.pdf import PdfConverter
from marker.models import create_model_dict
from marker.output import text_from_rendered
class MarkerParser(DocumentParser):
    """Parser implementation using Marker.

    Converts a document with Marker's ``PdfConverter`` and returns the
    extracted text in the requested output format.
    """

    # Translation table that deletes "#", "*" and "_" in a single pass; used
    # for the "text" output format.
    _MARKDOWN_MARKERS = str.maketrans("", "", "#*_")

    @classmethod
    def get_name(cls) -> str:
        """Return the name of this parser."""
        return "Marker"

    @classmethod
    def get_supported_ocr_methods(cls) -> List[Dict[str, Any]]:
        """Return the OCR methods this parser supports.

        Each entry is a dict with the keys ``id``, ``name`` and
        ``default_params``.
        """
        return [
            {
                "id": "no_ocr",
                "name": "No OCR",
                "default_params": {},
            },
            {
                "id": "force_ocr",
                "name": "Force OCR",
                "default_params": {},
            },
        ]

    def parse(self, file_path: Union[str, Path], ocr_method: Optional[str] = None, **kwargs) -> str:
        """Parse a document using Marker.

        Args:
            file_path: Path of the document to convert.
            ocr_method: ``"force_ocr"`` turns on Marker's ``force_ocr``
                option; any other value (including ``None``) leaves it off.
            **kwargs: Only ``output_format`` is read. It is matched
                case-insensitively against ``"json"``, ``"text"`` and
                ``"document_tags"``; any other value returns the content
                unchanged. A missing, ``None`` or empty value is treated as
                ``"markdown"``.

        Returns:
            The extracted content rendered in the requested format.
        """
        force_ocr = ocr_method == "force_ocr"

        # NOTE(review): a fresh model dict is created on every call; if
        # create_model_dict() is expensive, consider caching it - confirm.
        converter = PdfConverter(
            artifact_dict=create_model_dict(),
            config={"force_ocr": force_ocr},
        )
        rendered = converter(str(file_path))
        content, _, _ = text_from_rendered(rendered)

        # Normalize once. ``or`` also covers an explicit output_format=None,
        # which kwargs.get() alone would hand back as None and crash .lower().
        output_format = (kwargs.get("output_format") or "markdown").lower()

        if output_format == "json":
            return json.dumps({"content": content}, ensure_ascii=False, indent=2)
        if output_format == "text":
            # Drop the "#", "*" and "_" characters everywhere in the content.
            return content.translate(self._MARKDOWN_MARKERS)
        if output_format == "document_tags":
            return f"<doc>\n{content}\n</doc>"
        return content
# Register the parser with the registry at import time.
ParserRegistry.register(MarkerParser)