Spaces:
Running
on
Zero
Running
on
Zero
File size: 4,073 Bytes
dbdd7c8 55627c9 dbdd7c8 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 |
import logging
import os
from pathlib import Path
from typing import Dict, List, Optional, Any, Union
import io
# Import the parser interface and registry
from src.parsers.parser_interface import DocumentParser
from src.parsers.parser_registry import ParserRegistry
# Check for MarkItDown availability. Only the markitdown import decides
# HAS_MARKITDOWN, so a missing optional dependency cannot disable the parser.
try:
    from markitdown import MarkItDown
    HAS_MARKITDOWN = True
except ImportError:
    HAS_MARKITDOWN = False
    logging.warning("MarkItDown package not installed. Please install with 'pip install markitdown[all]'")

# OpenAI is optional: it is only used to build the LLM client for image
# descriptions. When the package is missing the name is still bound (to None)
# so that code referencing it fails in a controlled way instead of NameError.
try:
    from openai import OpenAI
except ImportError:
    OpenAI = None
    logging.warning("OpenAI package not installed. LLM-based image descriptions are unavailable; install with 'pip install openai'")

# Configure logging
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
class MarkItDownParser(DocumentParser):
    """
    Parser implementation using MarkItDown for converting various file formats to Markdown.

    One converter is built per parser instance. When the ``OPENAI_API_KEY``
    environment variable is set, an OpenAI client is passed to MarkItDown for
    LLM-based image descriptions; otherwise, or if that setup fails, a plain
    converter is used.
    """

    def __init__(self):
        # Stays None when MarkItDown is missing or cannot be constructed;
        # parse() reports that as an error string instead of raising.
        self.markdown_instance = None
        if not HAS_MARKITDOWN:
            return

        # An API key in the environment opts in to LLM-based image descriptions.
        if os.getenv("OPENAI_API_KEY"):
            try:
                self.markdown_instance = MarkItDown(
                    enable_plugins=False,
                    llm_client=OpenAI(),
                    llm_model="gpt-4o",
                )
                logger.info("MarkItDown initialized with OpenAI support for image descriptions")
                return
            except Exception as e:
                # A broken OpenAI setup must not take plain conversion down
                # with it, so log the failure and fall through to the
                # LLM-free converter below.
                logger.exception(
                    "Error initializing MarkItDown with OpenAI support, "
                    "falling back to plain conversion: %s",
                    e,
                )

        try:
            self.markdown_instance = MarkItDown(enable_plugins=False)
            logger.info("MarkItDown initialized without OpenAI support")
        except Exception as e:
            logger.exception("Error initializing MarkItDown: %s", e)
            self.markdown_instance = None

    def parse(self, file_path: Union[str, Path], ocr_method: Optional[str] = None, **kwargs) -> str:
        """
        Parse a document and return its content as Markdown.

        Failures are reported through the returned string (prefixed with
        ``"Error: "``) rather than raised.

        Args:
            file_path: Path to the document
            ocr_method: OCR method to use (not used in this parser)
            **kwargs: Additional options. ``check_cancellation`` may be a
                zero-argument callable returning True when the conversion
                should be abandoned; a missing or ``None`` value means the
                conversion is never cancelled.

        Returns:
            str: Markdown representation of the document, the text
            ``"Conversion cancelled."``, or an error message.
        """
        # Check if MarkItDown is available
        if not HAS_MARKITDOWN or self.markdown_instance is None:
            return "Error: MarkItDown is not available. Please install with 'pip install markitdown[all]'"

        # kwargs.get() only applies its default when the key is absent, so an
        # explicit check_cancellation=None has to be handled separately.
        check_cancellation = kwargs.get('check_cancellation')
        if check_cancellation is None:
            check_cancellation = lambda: False

        # Check for cancellation before starting
        if check_cancellation():
            return "Conversion cancelled."

        try:
            # Convert the file using the standard instance
            result = self.markdown_instance.convert(file_path)

            # The conversion itself is not interrupted; a cancellation
            # requested meanwhile just discards the finished result.
            if check_cancellation():
                return "Conversion cancelled."

            return result.text_content
        except Exception as e:
            logger.exception("Error converting file with MarkItDown: %s", e)
            return f"Error: {str(e)}"

    @classmethod
    def get_name(cls) -> str:
        """Return the display name of this parser."""
        return "MarkItDown (pdf, jpg, png, xlsx --best for xlsx)"

    @classmethod
    def get_supported_ocr_methods(cls) -> List[Dict[str, Any]]:
        """Return the selectable methods: a single "standard" entry with no parameters."""
        return [
            {
                "id": "standard",
                "name": "Standard Conversion",
                "default_params": {}
            }
        ]

    @classmethod
    def get_description(cls) -> str:
        """Return a one-line description of this parser."""
        return "MarkItDown parser for converting various file formats to Markdown"
# Make the parser discoverable through the registry, but only when the
# MarkItDown import probe succeeded; otherwise just record why it was skipped.
if not HAS_MARKITDOWN:
    logger.warning("Could not register MarkItDown parser: Package not installed")
else:
    ParserRegistry.register(MarkItDownParser)
    logger.info("MarkItDown parser registered successfully")