File size: 3,603 Bytes
27722f3
 
eb2eaac
101c783
 
27722f3
a370b95
 
27722f3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ac7733c
 
 
 
7e9f871
ac7733c
27722f3
ac7733c
 
02fee92
ac7733c
 
 
 
7e9f871
ac7733c
 
 
 
 
 
101c783
 
 
 
 
 
 
7e9f871
101c783
7e9f871
101c783
17ff7d9
 
 
 
 
 
 
 
 
7e9f871
101c783
02fee92
17ff7d9
b44cf1a
dab47f5
b44cf1a
dab47f5
b44cf1a
 
 
17ff7d9
b44cf1a
 
7e9f871
 
b44cf1a
7e9f871
 
 
101c783
7e9f871
101c783
dab47f5
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
from typing import Optional, Dict, Any, Union
from pathlib import Path
import threading
import logging
import time

from src.parsers.parser_interface import DocumentParser
from src.parsers.parser_registry import ParserRegistry


class ParserFactory:
    """Factory for creating parser instances."""
    
    @classmethod
    def create_parser(cls, parser_name: str) -> Optional[DocumentParser]:
        """
        Create a parser instance.
        
        Args:
            parser_name: Name of the parser to create
            
        Returns:
            An instance of the requested parser or None if not found
        """
        parser_class = ParserRegistry.get_parser_class(parser_name)
        if not parser_class:
            return None
        return parser_class()
    
    @classmethod
    def parse_document(cls, 
                      file_path: Union[str, Path], 
                      parser_name: str, 
                      ocr_method_name: str,
                      output_format: str = "markdown",
                      cancellation_flag: Optional[threading.Event] = None,
                      **kwargs) -> str:
        """
        Parse a document using the specified parser and OCR method.
        
        Args:
            file_path: Path to the document
            parser_name: Name of the parser to use
            ocr_method_name: Display name of the OCR method to use
            output_format: Output format (markdown, json, text, document_tags)
            cancellation_flag: Optional flag to check for cancellation
            **kwargs: Additional parser-specific options
            
        Returns:
            str: The parsed content
        """
        # Helper function to check cancellation
        def check_cancellation():
            if cancellation_flag and cancellation_flag.is_set():
                logging.info("Cancellation detected in parser_factory")
                return True
            return False
            
        # Check for cancellation immediately
        if check_cancellation():
            return "Conversion cancelled."
            
        parser = cls.create_parser(parser_name)
        if not parser:
            raise ValueError(f"Unknown parser: {parser_name}")
        
        # Get the internal OCR method ID
        ocr_method_id = ParserRegistry.get_ocr_method_id(parser_name, ocr_method_name)
        if not ocr_method_id:
            raise ValueError(f"Unknown OCR method: {ocr_method_name} for parser {parser_name}")
        
        # Check for cancellation again before starting the parsing
        if check_cancellation():
            return "Conversion cancelled."
        
        # Add a function to check cancellation that parsers can call
        def should_check_cancellation():
            """Function that parsers can call to check if they should check cancellation"""
            # No need to sleep here - this just returns whether cancellation should be checked
            return True
            
        # Parse the document, passing the cancellation flag and helper functions
        kwargs['cancellation_flag'] = cancellation_flag
        kwargs['check_cancellation'] = check_cancellation
        kwargs['should_check_cancellation'] = should_check_cancellation
        kwargs['output_format'] = output_format
        
        # Parse the document
        result = parser.parse(file_path, ocr_method=ocr_method_id, **kwargs)
        
        # Check one more time after parsing completes
        if check_cancellation():
            return "Conversion cancelled."
            
        return result