# docker_mineru/pdf_converter/convert_pdf_to_md.py
# PDF-to-Markdown converter built on the `marker` library (NVIDIA L4 support).
import marker
import os
import sys
import gc
import torch
from marker.config.parser import ConfigParser
from marker.models import create_model_dict
# Module-level cache for the pre-loaded marker converter.  Populated by
# initialize_converter() at application startup and read by convert_pdf();
# stays None until initialization succeeds (or after a failed attempt).
_converter = None
def initialize_converter():
    """Initialize the marker converter models and store them in the module
    global ``_converter``.

    Safe to call more than once: if the converter is already loaded this is a
    no-op.  On failure the global is reset to ``None``, GPU memory is released
    on a best-effort basis, and the original exception is re-raised.

    Raises:
        Exception: whatever ``marker`` raises while loading models or building
            the converter configuration.
    """
    global _converter
    if _converter is not None:
        print("Marker models already initialized.")
        return

    print("Initializing marker models...")
    try:
        # Clear any existing CUDA cache before loading models.
        _release_gpu_memory("CUDA memory before initialization")

        # Optionally redirect marker's font path via the MARKER_FONT_PATH env var.
        _apply_custom_font_path()

        # Create configuration, explicitly setting output format and a larger
        # batch multiplier to make better use of the GPU.
        config_parser = ConfigParser({
            'output_format': 'markdown',
            'batch_multiplier': 4,  # Increased from default 2
            # Add any device-specific configuration here
            'device': 'cuda' if torch.cuda.is_available() else 'cpu'
        })

        # Load models with explicit device mapping.
        models = create_model_dict()

        # Get converter class and create converter.
        converter_cls = config_parser.get_converter_cls()
        _converter = converter_cls(
            config=config_parser.generate_config_dict(),
            artifact_dict=models,
            processor_list=config_parser.get_processors(),
            renderer=config_parser.get_renderer(),
            llm_service=config_parser.get_llm_service()
        )

        # Force another garbage collection after model load.
        _release_gpu_memory("CUDA memory after initialization")
        print("Marker models initialized successfully with batch_multiplier=4.")
    except Exception as e:
        print(f"Failed to initialize marker models: {e}", file=sys.stderr)
        _converter = None  # Ensure it's None if init fails
        # Attempt to clean up GPU memory in case of initialization failure.
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
        gc.collect()
        raise


def _release_gpu_memory(label=None):
    """Collect Python garbage, then free cached CUDA blocks; optionally log stats.

    gc.collect() runs *before* empty_cache() so that unreferenced tensors are
    actually released first — empty_cache() can only return cached blocks that
    are no longer held by live Python objects.
    """
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        if label:
            print(f"{label}: {torch.cuda.memory_allocated()/1024**2:.2f} MB allocated, "
                  f"{torch.cuda.memory_reserved()/1024**2:.2f} MB reserved")


def _apply_custom_font_path():
    """If MARKER_FONT_PATH is set, point marker at a font inside that directory.

    Best-effort: any failure is logged and the default font path is kept.
    NOTE(review): assumes `marker.settings.FONT_PATH` is the attribute marker
    reads for its font — confirm against the installed marker version.
    """
    font_path = os.environ.get('MARKER_FONT_PATH')
    if not font_path:
        return
    try:
        # Import marker settings and override font path.
        from marker import settings
        os.makedirs(font_path, exist_ok=True)
        custom_font_path = os.path.join(font_path, 'NotoSans-Regular.ttf')
        settings.FONT_PATH = custom_font_path
        print(f"Using custom font path: {custom_font_path}")
    except ImportError:
        print("Could not import marker settings, using default font path")
    except Exception as e:
        print(f"Error setting custom font path: {e}", file=sys.stderr)
def convert_pdf(pdf_input_path, output_md_path=None):
    """
    Convert a PDF file to Markdown using the pre-loaded marker converter.

    Args:
        pdf_input_path (str): Path to the input PDF file.
        output_md_path (str, optional): Path where to save the output Markdown
            file. If None, the markdown is only returned.

    Returns:
        str: The markdown text.

    Raises:
        FileNotFoundError: If the input PDF does not exist.
        RuntimeError: If initialize_converter() has not been called successfully.
        Exception: Re-raises whatever the converter raises during conversion.
    """
    # Check if the input PDF exists.
    if not os.path.exists(pdf_input_path):
        raise FileNotFoundError(f"Input PDF file not found at '{pdf_input_path}'")

    # Check if converter is initialized.
    if _converter is None:
        raise RuntimeError("Marker converter has not been initialized. Call initialize_converter() during application startup.")

    print(f"Starting conversion of '{pdf_input_path}' using pre-loaded models...")
    try:
        # Free up any temporary memory before conversion.
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

        # Convert the PDF to markdown using the pre-loaded converter.
        result = _converter(pdf_input_path)
        # Access the markdown content directly from the result object.
        markdown_text = result.markdown

        # If output path is provided, save the markdown.
        if output_md_path:
            output_dir = os.path.dirname(output_md_path)
            if output_dir:
                # exist_ok=True already tolerates an existing directory, so the
                # former os.path.exists() pre-check was redundant and racy.
                os.makedirs(output_dir, exist_ok=True)
            with open(output_md_path, "w", encoding="utf-8") as f:
                f.write(markdown_text)
            print(f"Successfully saved markdown to '{output_md_path}'")

        # Clean up temporary GPU memory after conversion.
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

        return markdown_text
    except Exception as e:
        print(f"An error occurred during conversion: {e}", file=sys.stderr)
        # type(e).__name__ gives a readable class name instead of "<class '...'>".
        print(f"Error details: {type(e).__name__}", file=sys.stderr)
        # Try to clean up GPU memory on error.
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
        raise