Spaces:
Running
on
Zero
Running
on
Zero
File size: 3,603 Bytes
dda982a |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 |
from typing import Optional, Dict, Any, Union
from pathlib import Path
import threading
import logging
import time
from src.parsers.parser_interface import DocumentParser
from src.parsers.parser_registry import ParserRegistry
class ParserFactory:
    """Builds parser instances and drives document conversions through them."""

    @classmethod
    def create_parser(cls, parser_name: str) -> Optional[DocumentParser]:
        """
        Instantiate the parser registered under the given name.

        Args:
            parser_name: Name used to look up the parser class in ParserRegistry

        Returns:
            A freshly constructed parser, or None when the registry lookup
            yields nothing
        """
        registered_class = ParserRegistry.get_parser_class(parser_name)
        return registered_class() if registered_class else None

    @classmethod
    def parse_document(cls,
                       file_path: Union[str, Path],
                       parser_name: str,
                       ocr_method_name: str,
                       output_format: str = "markdown",
                       cancellation_flag: Optional[threading.Event] = None,
                       **kwargs) -> str:
        """
        Convert a document with the named parser and OCR method.

        Cancellation is polled at three points: on entry, right before the
        parser is invoked, and once more after it returns.

        Args:
            file_path: Path to the document
            parser_name: Name of the parser to use
            ocr_method_name: Display name of the OCR method to use
            output_format: Output format (markdown, json, text, document_tags)
            cancellation_flag: Optional event; when set, the conversion is
                treated as cancelled
            **kwargs: Additional parser-specific options, forwarded to the
                parser's ``parse`` call

        Returns:
            str: The parser's result, or the string "Conversion cancelled."
            if the flag was found set at any of the checkpoints

        Raises:
            ValueError: If no parser is registered under ``parser_name``, or
                no OCR method ID is found for ``ocr_method_name``
        """
        cancelled_message = "Conversion cancelled."

        def check_cancellation():
            """Report whether the cancellation flag is present and set."""
            if not (cancellation_flag and cancellation_flag.is_set()):
                return False
            logging.info("Cancellation detected in parser_factory")
            return True

        def should_check_cancellation():
            """Hook for parsers asking whether to poll cancellation; always True."""
            return True

        # Checkpoint 1: bail out before doing any work at all.
        if check_cancellation():
            return cancelled_message

        parser = cls.create_parser(parser_name)
        if not parser:
            raise ValueError(f"Unknown parser: {parser_name}")

        # Translate the display name into the registry's internal OCR method ID.
        ocr_method_id = ParserRegistry.get_ocr_method_id(parser_name, ocr_method_name)
        if not ocr_method_id:
            raise ValueError(f"Unknown OCR method: {ocr_method_name} for parser {parser_name}")

        # Checkpoint 2: the lookups are done, parsing has not started yet.
        if check_cancellation():
            return cancelled_message

        # Hand the flag and both helper callables to the parser alongside
        # the caller's own options.
        kwargs.update(
            cancellation_flag=cancellation_flag,
            check_cancellation=check_cancellation,
            should_check_cancellation=should_check_cancellation,
            output_format=output_format,
        )

        parsed_output = parser.parse(file_path, ocr_method=ocr_method_id, **kwargs)

        # Checkpoint 3: a cancellation raised during parsing overrides the result.
        return cancelled_message if check_cancellation() else parsed_output