File size: 1,414 Bytes
dda982a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
from abc import ABC, abstractmethod
from pathlib import Path
from typing import Dict, List, Optional, Any, Union


class DocumentParser(ABC):
    """Abstract contract that every document parser must implement.

    Concrete subclasses have to provide ``parse``, ``get_name`` and
    ``get_supported_ocr_methods``; ``get_description`` comes with a default
    implementation derived from ``get_name``.
    """

    @abstractmethod
    def parse(
        self,
        file_path: Union[str, Path],
        ocr_method: Optional[str] = None,
        **kwargs
    ) -> str:
        """Read the document at ``file_path`` and produce its content.

        Args:
            file_path: Location of the document, as a string or ``Path``.
            ocr_method: Which OCR method to apply, when one is relevant;
                defaults to ``None``.
            **kwargs: Extra options understood by the concrete parser.

        Returns:
            str: The content extracted from the document.
        """

    @classmethod
    @abstractmethod
    def get_name(cls) -> str:
        """Give the human-readable name of this parser."""

    @classmethod
    @abstractmethod
    def get_supported_ocr_methods(cls) -> List[Dict[str, Any]]:
        """List the OCR methods this parser can work with.

        Returns:
            A list of dictionaries, each carrying the keys:
                - id: Unique identifier of the OCR method
                - name: Display name of the OCR method
                - default_params: Default parameters of the OCR method
        """

    @classmethod
    def get_description(cls) -> str:
        """Give a short description of this parser.

        The default text is built from ``get_name()``.
        """
        parser_name = cls.get_name()
        return f"{parser_name} document parser"