# docker_mineru/pdf_converter/convert_pdf_to_md.py
# PDF-to-Markdown converter built on the `marker` library (NVIDIA L4 support).
import marker
import os
import sys
import gc
import torch
from marker.config.parser import ConfigParser
from marker.models import create_model_dict
# Module-level cache for the pre-loaded marker converter.  Populated by
# initialize_converter() at application startup and read by convert_pdf();
# stays None until initialization succeeds (or after a failed attempt).
_converter = None
def initialize_converter():
    """Initialize the marker converter models and store them in the module
    global ``_converter``.

    Safe to call more than once: if the converter is already loaded this is a
    no-op.  On failure the global is reset to ``None``, GPU memory is released
    on a best-effort basis, and the original exception is re-raised.

    Raises:
        Exception: whatever ``marker`` raises while loading models or building
            the converter configuration.
    """
    global _converter
    if _converter is not None:
        print("Marker models already initialized.")
        return

    print("Initializing marker models...")
    try:
        # Clear any existing CUDA cache before loading models.
        _release_gpu_memory("CUDA memory before initialization")

        # Optionally redirect marker's font path via the MARKER_FONT_PATH env var.
        _apply_custom_font_path()

        # Create configuration, explicitly setting output format and a larger
        # batch multiplier to make better use of the GPU.
        config_parser = ConfigParser({
            'output_format': 'markdown',
            'batch_multiplier': 4,  # Increased from default 2
            # Add any device-specific configuration here
            'device': 'cuda' if torch.cuda.is_available() else 'cpu'
        })

        # Load models with explicit device mapping.
        models = create_model_dict()

        # Get converter class and create converter.
        converter_cls = config_parser.get_converter_cls()
        _converter = converter_cls(
            config=config_parser.generate_config_dict(),
            artifact_dict=models,
            processor_list=config_parser.get_processors(),
            renderer=config_parser.get_renderer(),
            llm_service=config_parser.get_llm_service()
        )

        # Force another garbage collection after model load.
        _release_gpu_memory("CUDA memory after initialization")
        print("Marker models initialized successfully with batch_multiplier=4.")
    except Exception as e:
        print(f"Failed to initialize marker models: {e}", file=sys.stderr)
        _converter = None  # Ensure it's None if init fails
        # Attempt to clean up GPU memory in case of initialization failure.
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
        gc.collect()
        raise


def _release_gpu_memory(label=None):
    """Collect Python garbage, then free cached CUDA blocks; optionally log stats.

    gc.collect() runs *before* empty_cache() so that unreferenced tensors are
    actually released first — empty_cache() can only return cached blocks that
    are no longer held by live Python objects.
    """
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        if label:
            print(f"{label}: {torch.cuda.memory_allocated()/1024**2:.2f} MB allocated, "
                  f"{torch.cuda.memory_reserved()/1024**2:.2f} MB reserved")


def _apply_custom_font_path():
    """If MARKER_FONT_PATH is set, point marker at a font inside that directory.

    Best-effort: any failure is logged and the default font path is kept.
    NOTE(review): assumes `marker.settings.FONT_PATH` is the attribute marker
    reads for its font — confirm against the installed marker version.
    """
    font_path = os.environ.get('MARKER_FONT_PATH')
    if not font_path:
        return
    try:
        # Import marker settings and override font path.
        from marker import settings
        os.makedirs(font_path, exist_ok=True)
        custom_font_path = os.path.join(font_path, 'NotoSans-Regular.ttf')
        settings.FONT_PATH = custom_font_path
        print(f"Using custom font path: {custom_font_path}")
    except ImportError:
        print("Could not import marker settings, using default font path")
    except Exception as e:
        print(f"Error setting custom font path: {e}", file=sys.stderr)
def convert_pdf(pdf_input_path, output_md_path=None):
    """
    Convert a PDF file to Markdown using the pre-loaded marker converter.

    Args:
        pdf_input_path (str): Path to the input PDF file.
        output_md_path (str, optional): Path where to save the output Markdown
            file. If None, the markdown is only returned.

    Returns:
        str: The markdown text.

    Raises:
        FileNotFoundError: If the input PDF does not exist.
        RuntimeError: If initialize_converter() has not been called successfully.
        Exception: Re-raises whatever the converter raises during conversion.
    """
    # Check if the input PDF exists.
    if not os.path.exists(pdf_input_path):
        raise FileNotFoundError(f"Input PDF file not found at '{pdf_input_path}'")

    # Check if converter is initialized.
    if _converter is None:
        raise RuntimeError("Marker converter has not been initialized. Call initialize_converter() during application startup.")

    print(f"Starting conversion of '{pdf_input_path}' using pre-loaded models...")
    try:
        # Free up any temporary memory before conversion.
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

        # Convert the PDF to markdown using the pre-loaded converter.
        result = _converter(pdf_input_path)
        # Access the markdown content directly from the result object.
        markdown_text = result.markdown

        # If output path is provided, save the markdown.
        if output_md_path:
            output_dir = os.path.dirname(output_md_path)
            if output_dir:
                # exist_ok=True already tolerates an existing directory, so the
                # former os.path.exists() pre-check was redundant and racy.
                os.makedirs(output_dir, exist_ok=True)
            with open(output_md_path, "w", encoding="utf-8") as f:
                f.write(markdown_text)
            print(f"Successfully saved markdown to '{output_md_path}'")

        # Clean up temporary GPU memory after conversion.
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

        return markdown_text
    except Exception as e:
        print(f"An error occurred during conversion: {e}", file=sys.stderr)
        # type(e).__name__ gives a readable class name instead of "<class '...'>".
        print(f"Error details: {type(e).__name__}", file=sys.stderr)
        # Try to clean up GPU memory on error.
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
        raise