import gradio as gr import markdown import threading import time import logging from pathlib import Path from src.core.converter import convert_file, set_cancellation_flag, is_conversion_in_progress from src.parsers.parser_registry import ParserRegistry # Import MarkItDown to check if it's available try: from markitdown import MarkItDown HAS_MARKITDOWN = True logging.info("MarkItDown is available for use") except ImportError: HAS_MARKITDOWN = False logging.warning("MarkItDown is not available") # Configure logging logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s') logger = logging.getLogger(__name__) # Add a global variable to track cancellation state conversion_cancelled = threading.Event() # Pass the cancellation flag to the converter module set_cancellation_flag(conversion_cancelled) # Add a background thread to monitor cancellation def monitor_cancellation(): """Background thread to monitor cancellation and update UI if needed""" logger.info("Starting cancellation monitor thread") while is_conversion_in_progress(): if conversion_cancelled.is_set(): logger.info("Cancellation detected by monitor thread") time.sleep(0.1) # Check every 100ms logger.info("Cancellation monitor thread ending") def validate_file_for_parser(file_path, parser_name): """Validate if the file type is supported by the selected parser.""" if not file_path: return True, "" # No file selected yet if "GOT-OCR" in parser_name: file_ext = Path(file_path).suffix.lower() if file_ext not in ['.jpg', '.jpeg', '.png']: return False, "GOT-OCR only supports JPG and PNG formats." return True, "" def format_markdown_content(content): if not content: return content # Convert the content to HTML using markdown library html_content = markdown.markdown(str(content), extensions=['tables']) return html_content # Function to run conversion in a separate thread def run_conversion_thread(file_path, parser_name, ocr_method_name, output_format): """Run the conversion in a separate thread and return the thread object""" global conversion_cancelled # Reset the cancellation flag conversion_cancelled.clear() # Create a container for the results results = {"content": None, "download_file": None, "error": None} def conversion_worker(): try: content, download_file = convert_file(file_path, parser_name, ocr_method_name, output_format) results["content"] = content results["download_file"] = download_file except Exception as e: logger.error(f"Error during conversion: {str(e)}") results["error"] = str(e) # Create and start the thread thread = threading.Thread(target=conversion_worker) thread.daemon = True thread.start() return thread, results def handle_convert(file_path, parser_name, ocr_method_name, output_format, is_cancelled): """Handle file conversion.""" global conversion_cancelled # Check if we should cancel before starting if is_cancelled: logger.info("Conversion cancelled before starting") return "Conversion cancelled.", None, gr.update(visible=False), gr.update(visible=True), gr.update(visible=False) # Validate file type for the selected parser is_valid, error_msg = validate_file_for_parser(file_path, parser_name) if not is_valid: logger.error(f"File validation error: {error_msg}") return f"Error: {error_msg}", None, gr.update(visible=False), gr.update(visible=True), gr.update(visible=False) logger.info("Starting conversion with cancellation flag cleared") # Start the conversion in a separate thread thread, results = run_conversion_thread(file_path, parser_name, ocr_method_name, output_format) # Start the monitoring thread monitor_thread = threading.Thread(target=monitor_cancellation) monitor_thread.daemon = True monitor_thread.start() # Wait for the thread to complete or be cancelled while thread.is_alive(): # Check if cancellation was requested if conversion_cancelled.is_set(): logger.info("Cancellation detected, waiting for thread to finish") # Give the thread a chance to clean up thread.join(timeout=0.5) if thread.is_alive(): logger.warning("Thread did not finish within timeout") return "Conversion cancelled.", None, gr.update(visible=False), gr.update(visible=True), gr.update(visible=False) # Sleep briefly to avoid busy waiting time.sleep(0.1) # Thread has completed, check results if results["error"]: return f"Error: {results['error']}", None, gr.update(visible=False), gr.update(visible=True), gr.update(visible=False) content = results["content"] download_file = results["download_file"] # If conversion returned a cancellation message if content == "Conversion cancelled.": logger.info("Converter returned cancellation message") return content, None, gr.update(visible=False), gr.update(visible=True), gr.update(visible=False) # Format the content and wrap it in the scrollable container formatted_content = format_markdown_content(str(content)) html_output = f"