File size: 4,073 Bytes
dbdd7c8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
55627c9
dbdd7c8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
import logging
import os
from pathlib import Path
from typing import Dict, List, Optional, Any, Union
import io

# Import the parser interface and registry
from src.parsers.parser_interface import DocumentParser
from src.parsers.parser_registry import ParserRegistry

# Module-level logger for this parser; its own level is pinned to DEBUG.
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)

# Probe the optional dependencies once at import time. Both imports share a
# single try block, so a missing `openai` package also leaves HAS_MARKITDOWN
# False. The warning is emitted through the root `logging` module functions,
# not through the module logger defined above.
try:
    from markitdown import MarkItDown
    from openai import OpenAI
except ImportError:
    HAS_MARKITDOWN = False
    logging.warning("MarkItDown package not installed. Please install with 'pip install markitdown[all]'")
else:
    HAS_MARKITDOWN = True

class MarkItDownParser(DocumentParser):
    """
    Parser implementation using MarkItDown for converting various file formats to Markdown.

    One ``MarkItDown`` converter is created per parser instance and stored in
    ``markdown_instance``. When the ``OPENAI_API_KEY`` environment variable is
    set, the converter is built with an OpenAI client (used for LLM-based
    image descriptions); otherwise a plain converter is built. If the package
    is unavailable or construction fails, ``markdown_instance`` stays ``None``
    and ``parse`` returns an error string instead of raising.
    """

    def __init__(self, *, llm_model: str = "gpt-4o"):
        """
        Create the MarkItDown converter if the package could be imported.

        Args:
            llm_model: Model name forwarded to MarkItDown as ``llm_model`` when
                an OpenAI client is configured. Defaults to ``"gpt-4o"``.
        """
        self.markdown_instance = None
        if not HAS_MARKITDOWN:
            return

        try:
            # The key is only used as an on/off switch here; the client is
            # constructed without arguments (presumably it reads the key from
            # the environment itself -- confirm against the OpenAI SDK).
            if os.getenv("OPENAI_API_KEY"):
                self.markdown_instance = MarkItDown(
                    enable_plugins=False,
                    llm_client=OpenAI(),
                    llm_model=llm_model,
                )
                logger.info("MarkItDown initialized with OpenAI support for image descriptions")
            else:
                self.markdown_instance = MarkItDown(enable_plugins=False)
                logger.info("MarkItDown initialized without OpenAI support")
        except Exception as exc:
            # Best-effort initialization: keep the parser constructible, but
            # record the full traceback so the failure can be diagnosed.
            logger.exception("Error initializing MarkItDown: %s", exc)
            self.markdown_instance = None

    def parse(self, file_path: Union[str, Path], ocr_method: Optional[str] = None, **kwargs) -> str:
        """
        Parse a document and return its content as Markdown.

        Args:
            file_path: Path to the document
            ocr_method: OCR method to use (not used in this parser)
            **kwargs: Additional options. ``check_cancellation`` may be a
                zero-argument callable returning True when the conversion
                should be abandoned; a missing or ``None`` value means the
                conversion is never cancelled.

        Returns:
            str: Markdown representation of the document, the literal
            ``"Conversion cancelled."`` when cancellation was requested, or a
            string starting with ``"Error: "`` when MarkItDown is unavailable
            or the conversion raised.
        """
        # Check if MarkItDown is available
        if not HAS_MARKITDOWN or self.markdown_instance is None:
            return "Error: MarkItDown is not available. Please install with 'pip install markitdown[all]'"

        # An explicit check_cancellation=None is treated like an absent one,
        # so the calls below never attempt to invoke None.
        check_cancellation = kwargs.get('check_cancellation') or (lambda: False)

        # Check for cancellation before starting
        if check_cancellation():
            return "Conversion cancelled."

        try:
            # Hand path-like objects to MarkItDown as plain strings; other
            # inputs are passed through untouched.
            # NOTE(review): some MarkItDown releases are believed to accept
            # only str paths -- confirm against the pinned version.
            source = os.fspath(file_path) if isinstance(file_path, os.PathLike) else file_path
            result = self.markdown_instance.convert(source)

            # Check for cancellation after processing
            if check_cancellation():
                return "Conversion cancelled."

            return result.text_content
        except Exception as exc:
            # Errors are reported to the caller as text; the traceback goes
            # to the log.
            logger.exception("Error converting file with MarkItDown: %s", exc)
            return f"Error: {str(exc)}"

    @classmethod
    def get_name(cls) -> str:
        """Return the display name of this parser."""
        return "MarkItDown (pdf, jpg, png, xlsx --best for xlsx)"

    @classmethod
    def get_supported_ocr_methods(cls) -> List[Dict[str, Any]]:
        """Return the single conversion method this parser offers ("standard")."""
        return [
            {
                "id": "standard",
                "name": "Standard Conversion",
                "default_params": {}
            }
        ]

    @classmethod
    def get_description(cls) -> str:
        """Return a one-line description of this parser."""
        return "MarkItDown parser for converting various file formats to Markdown"


# Expose the parser through the registry only when the import probe at the
# top of this module succeeded; otherwise just log why it is missing.
if not HAS_MARKITDOWN:
    logger.warning("Could not register MarkItDown parser: Package not installed")
else:
    ParserRegistry.register(MarkItDownParser)
    logger.info("MarkItDown parser registered successfully")