Spaces:
Running
on
Zero
Running
on
Zero
File size: 9,148 Bytes
dda982a b3a5734 dda982a 5b7f920 dda982a 5b7f920 dda982a |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 |
import tempfile
import logging
import time
import os
from pathlib import Path
# Project imports (absolute, rooted at the `src` package)
from src.core.parser_factory import ParserFactory
# Import all parsers to ensure they're registered
from src import parsers
# Import the LaTeX to Markdown converter
try:
    from src.core.latex_to_markdown_converter import convert_latex_to_markdown
    HAS_GEMINI_CONVERTER = True
except ImportError:
    # The converter is optional: when it cannot be imported, convert_file
    # skips the LaTeX -> Markdown step and returns the parser output as is.
    HAS_GEMINI_CONVERTER = False
    logging.warning("LaTeX to Markdown converter not available. Raw LaTeX will be returned for formatted text.")

# Reference to the cancellation flag from ui.py.
# This will be set by the UI when the cancel button is clicked.
conversion_cancelled = None  # Will be a threading.Event object

# Flag to track if conversion is currently in progress
_conversion_in_progress = False
def set_cancellation_flag(flag):
    """Set the reference to the cancellation flag from ui.py.

    Args:
        flag: Object stored in the module-level ``conversion_cancelled``.
            ``check_cancellation`` calls ``is_set()`` on it, so it is
            expected to behave like a ``threading.Event``; ``None`` leaves
            cancellation checks permanently negative.
    """
    global conversion_cancelled
    conversion_cancelled = flag
def is_conversion_in_progress():
    """Check if conversion is currently in progress.

    Returns:
        The current value of the module-level ``_conversion_in_progress``
        flag, which ``convert_file`` sets on entry and clears on exit.
    """
    # Read-only access to a module global needs no ``global`` declaration.
    return _conversion_in_progress
def check_cancellation():
    """Check if cancellation has been requested.

    Returns:
        bool: True when a cancellation flag has been registered and its
        ``is_set()`` reports True; False when no flag is registered or the
        flag is not set.
    """
    # Snapshot the module-level reference so the truthiness test and the
    # is_set() call are guaranteed to look at the same object.
    flag = conversion_cancelled
    if flag and flag.is_set():
        logging.info("Cancellation detected in check_cancellation")
        return True
    return False
def safe_delete_file(file_path):
    """Safely delete a file with error handling.

    Args:
        file_path: Path of the file to remove. Falsy values (``None``,
            ``""``) are ignored.

    Failures are logged rather than raised, so the function is safe to call
    from cleanup paths. A file that is already gone is not an error.
    """
    if not file_path:
        return
    try:
        # Attempt the removal directly instead of checking existence first:
        # the file could disappear between the check and the unlink.
        os.unlink(file_path)
    except FileNotFoundError:
        # Nothing left to clean up.
        pass
    except Exception as e:
        logging.error(f"Error cleaning up temp file {file_path}: {e}")
def convert_file(file_path, parser_name, ocr_method_name, output_format):
    """
    Convert a file using the specified parser and OCR method.

    The input is copied to a temporary file, parsed through
    ``ParserFactory.parse_document``, optionally converted from LaTeX to
    Markdown, and written to a temporary download file. Cancellation is
    polled between stages and between I/O chunks.

    Args:
        file_path: Path to the file
        parser_name: Name of the parser to use
        ocr_method_name: Name of the OCR method to use
        output_format: Output format (Markdown, JSON, Text, Document Tags)

    Returns:
        tuple: (content, download_file_path). On error or cancellation the
        first item is a message string and the second item is None. When a
        download path is returned, the file it names is left on disk for the
        caller.
    """
    global _conversion_in_progress

    # Set the conversion in progress flag
    _conversion_in_progress = True

    # Temporary file paths to clean up
    temp_input = None   # copy of the input handed to the parser
    tmp_path = None     # download file holding the converted content
    succeeded = False   # True only when tmp_path is being returned to the caller

    # Download-file extension per output format; anything else becomes .txt
    extensions = {
        "Markdown": ".md",
        "JSON": ".json",
        "Text": ".txt",
        "Document Tags": ".doctags",
    }

    # All temp-file cleanup happens in the finally clause below, i.e. after
    # every ``with`` block has closed its file handle.
    try:
        if not file_path:
            return "Please upload a file.", None

        # Check for cancellation
        if check_cancellation():
            logging.info("Cancellation detected at start of convert_file")
            return "Conversion cancelled.", None

        # Create a temporary file with English filename
        try:
            original_ext = Path(file_path).suffix
            with tempfile.NamedTemporaryFile(suffix=original_ext, delete=False) as temp_file:
                temp_input = temp_file.name
                # Copy the content of original file to temp file
                with open(file_path, 'rb') as original:
                    # Read in smaller chunks and check for cancellation between chunks
                    chunk_size = 1024 * 1024  # 1MB chunks
                    while True:
                        # Check for cancellation frequently
                        if check_cancellation():
                            logging.info("Cancellation detected during file copy")
                            return "Conversion cancelled.", None
                        chunk = original.read(chunk_size)
                        if not chunk:
                            break
                        temp_file.write(chunk)
            # Switch to the copy only once it has been flushed and closed.
            file_path = temp_input
        except Exception as e:
            return f"Error creating temporary file: {e}", None

        # Check for cancellation again
        if check_cancellation():
            logging.info("Cancellation detected after file preparation")
            return "Conversion cancelled.", None

        content = None
        try:
            # Use the parser factory to parse the document
            start = time.time()
            content = ParserFactory.parse_document(
                file_path=file_path,
                parser_name=parser_name,
                ocr_method_name=ocr_method_name,
                output_format=output_format.lower(),
                cancellation_flag=conversion_cancelled,  # Pass the flag to parsers
            )

            # If content indicates cancellation, return early
            if content == "Conversion cancelled.":
                logging.info("Parser reported cancellation")
                return content, None

            duration = time.time() - start
            logging.info(f"Processed in {duration:.2f} seconds.")

            # Check for cancellation after processing
            if check_cancellation():
                logging.info("Cancellation detected after processing")
                return "Conversion cancelled.", None

            # Process LaTeX content for GOT-OCR formatted text
            if (
                parser_name == "GOT-OCR (jpg,png only)"
                and ocr_method_name == "Formatted Text"
                and HAS_GEMINI_CONVERTER
            ):
                logging.info("Converting LaTeX output to Markdown using Gemini API")
                start_convert = time.time()

                # Check for cancellation before conversion
                if check_cancellation():
                    logging.info("Cancellation detected before LaTeX conversion")
                    return "Conversion cancelled.", None

                try:
                    markdown_content = convert_latex_to_markdown(content)
                    if markdown_content:
                        content = markdown_content
                        logging.info(f"LaTeX conversion completed in {time.time() - start_convert:.2f} seconds")
                    else:
                        logging.warning("LaTeX to Markdown conversion failed, using raw LaTeX output")
                except Exception as e:
                    # Continue with the original content on error
                    logging.error(f"Error converting LaTeX to Markdown: {str(e)}")

                # Check for cancellation after conversion
                if check_cancellation():
                    logging.info("Cancellation detected after LaTeX conversion")
                    return "Conversion cancelled.", None
        except Exception as e:
            return f"Error: {e}", None

        # Determine the file extension based on the output format
        ext = extensions.get(output_format, ".txt")

        # Check for cancellation again
        if check_cancellation():
            logging.info("Cancellation detected before output file creation")
            return "Conversion cancelled.", None

        try:
            # Create a temporary file for download
            with tempfile.NamedTemporaryFile(mode="w", suffix=ext, delete=False, encoding="utf-8") as tmp:
                tmp_path = tmp.name
                # Write in chunks and check for cancellation
                chunk_size = 10000  # characters
                for i in range(0, len(content), chunk_size):
                    # Check for cancellation
                    if check_cancellation():
                        logging.info("Cancellation detected during output file writing")
                        return "Conversion cancelled.", None
                    tmp.write(content[i:i + chunk_size])

            # From here on the download file belongs to the caller; the
            # finally clause must not remove it, or the returned path would
            # point at a file that no longer exists.
            succeeded = True
            return content, tmp_path
        except Exception as e:
            return f"Error: {e}", None
    finally:
        # Always clean up the temporary input copy
        safe_delete_file(temp_input)
        # Remove the download file on every path that did not hand it out
        if not succeeded:
            safe_delete_file(tmp_path)
        # Always clear the conversion in progress flag when done
        _conversion_in_progress = False
|