# NOTE(review): the three lines "Spaces:" / "Sleeping" / "Sleeping" appear to be
# Hugging Face Spaces page-scrape residue, not source code; kept here as a
# comment so the module remains valid Python.
# Standard library
import gc
import os
import sys

# Third-party
import torch

# Project: marker PDF-to-Markdown library
import marker
from marker.config.parser import ConfigParser
from marker.models import create_model_dict

# Module-level cache for the pre-loaded marker converter.
# Populated once by initialize_converter(); read by convert_pdf().
_converter = None
def initialize_converter():
    """Load the marker models and build the global converter (idempotent).

    Stores the converter in the module-level ``_converter`` so that
    convert_pdf() can reuse the pre-loaded models across requests.
    Subsequent calls are no-ops once initialization has succeeded.

    Raises:
        Exception: whatever the underlying model load raises; the error is
            logged, GPU memory is cleaned up best-effort, ``_converter`` is
            reset to None (so a retry is possible), and the exception is
            re-raised for the caller to handle at startup.
    """
    global _converter
    if _converter is not None:
        print("Marker models already initialized.")
        return

    print("Initializing marker models...")
    try:
        # Clear any existing CUDA cache before loading models so the
        # "before" memory report reflects a clean baseline.
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
            gc.collect()
            print(f"CUDA memory before initialization: {torch.cuda.memory_allocated()/1024**2:.2f} MB allocated, {torch.cuda.memory_reserved()/1024**2:.2f} MB reserved")

        # Optionally override marker's font path from the environment.
        # Best-effort: any failure here falls back to the default font.
        font_path = os.environ.get('MARKER_FONT_PATH')
        if font_path:
            try:
                # Import marker settings and override the font path.
                from marker import settings
                os.makedirs(font_path, exist_ok=True)
                custom_font_path = os.path.join(font_path, 'NotoSans-Regular.ttf')
                settings.FONT_PATH = custom_font_path
                print(f"Using custom font path: {custom_font_path}")
            except ImportError:
                print("Could not import marker settings, using default font path")
            except Exception as e:
                print(f"Error setting custom font path: {e}", file=sys.stderr)

        # Explicit output format, raised batch multiplier, and device choice.
        config_parser = ConfigParser({
            'output_format': 'markdown',
            'batch_multiplier': 4,  # increased from the default of 2
            # Add any device-specific configuration here.
            'device': 'cuda' if torch.cuda.is_available() else 'cpu'
        })

        # Load the model artifacts and assemble the converter.
        models = create_model_dict()
        converter_cls = config_parser.get_converter_cls()
        _converter = converter_cls(
            config=config_parser.generate_config_dict(),
            artifact_dict=models,
            processor_list=config_parser.get_processors(),
            renderer=config_parser.get_renderer(),
            llm_service=config_parser.get_llm_service()
        )

        # Reclaim any transient allocations made during the model load.
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
            gc.collect()
            print(f"CUDA memory after initialization: {torch.cuda.memory_allocated()/1024**2:.2f} MB allocated, {torch.cuda.memory_reserved()/1024**2:.2f} MB reserved")
        print("Marker models initialized successfully with batch_multiplier=4.")
    except Exception as e:
        print(f"Failed to initialize marker models: {e}", file=sys.stderr)
        _converter = None  # ensure a failed init stays retryable
        # Best-effort GPU cleanup before propagating the failure.
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
            gc.collect()
        raise
def convert_pdf(pdf_input_path, output_md_path=None):
    """Convert a PDF file to Markdown using the pre-loaded marker converter.

    Args:
        pdf_input_path (str): Path to the input PDF file.
        output_md_path (str, optional): Path where the output Markdown file
            is saved. If None, the markdown is only returned.

    Returns:
        str: The markdown text.

    Raises:
        FileNotFoundError: If ``pdf_input_path`` does not exist.
        RuntimeError: If initialize_converter() has not been called.
        Exception: Any conversion error is logged and re-raised.
    """
    if not os.path.exists(pdf_input_path):
        raise FileNotFoundError(f"Input PDF file not found at '{pdf_input_path}'")
    if _converter is None:
        raise RuntimeError("Marker converter has not been initialized. Call initialize_converter() during application startup.")

    print(f"Starting conversion of '{pdf_input_path}' using pre-loaded models...")
    try:
        # Free up any temporary GPU memory before conversion.
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

        # Run the pre-loaded converter; the rendered result exposes the
        # markdown content directly as an attribute.
        result = _converter(pdf_input_path)
        markdown_text = result.markdown

        if output_md_path:
            output_dir = os.path.dirname(output_md_path)
            # exist_ok=True already tolerates a pre-existing directory, so
            # no separate os.path.exists() check is needed (and it avoids a
            # check-then-create race).
            if output_dir:
                os.makedirs(output_dir, exist_ok=True)
            with open(output_md_path, "w", encoding="utf-8") as f:
                f.write(markdown_text)
            print(f"Successfully saved markdown to '{output_md_path}'")

        # Clean up temporary GPU memory after conversion.
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
        return markdown_text
    except Exception as e:
        print(f"An error occurred during conversion: {e}", file=sys.stderr)
        print(f"Error details: {str(type(e))}", file=sys.stderr)
        # Try to clean up GPU memory on error before propagating.
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
        raise