Spaces:
Sleeping
Sleeping
File size: 7,243 Bytes
27722f3 a370b95 27722f3 66e859e 101c783 b44cf1a eb2eaac 66e859e 7b03bb1 101c783 b44cf1a 101c783 7b03bb1 dab47f5 27722f3 b44cf1a 66e859e b44cf1a dab47f5 b44cf1a 27722f3 b44cf1a eb2eaac b44cf1a 17ff7d9 b44cf1a dab47f5 b44cf1a dab47f5 b44cf1a 17ff7d9 b44cf1a 101c783 b44cf1a dab47f5 eb2eaac 7b03bb1 b44cf1a 17ff7d9 b44cf1a dab47f5 b44cf1a 7e9f871 b44cf1a ac7733c b44cf1a dab47f5 b44cf1a dab47f5 b44cf1a dab47f5 b44cf1a 17ff7d9 b44cf1a dab47f5 b44cf1a dab47f5 b44cf1a dab47f5 b44cf1a dab47f5 b44cf1a |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 |
import tempfile
import logging
import time
import os
from pathlib import Path
# Absolute import of the parser factory from the project's `src` package
from src.core.parser_factory import ParserFactory
# Import all parsers to ensure they're registered
import parsers
# Shared cancellation signal. Starts as None and is replaced through
# set_cancellation_flag(); check_cancellation() calls .is_set() on it, so the
# UI is expected to hand over a threading.Event that it sets when the cancel
# button is clicked.
conversion_cancelled = None # Will be a threading.Event object
# True while convert_file() is executing; convert_file() resets it to False in
# its finally block. Read through is_conversion_in_progress().
_conversion_in_progress = False
def set_cancellation_flag(flag):
    """Install *flag* as the module-wide cancellation signal.

    Args:
        flag: Object exposing ``is_set()`` (the original comment says this
            is the ``threading.Event`` owned by ui.py). Passing ``None``
            disables cancellation checks.
    """
    global conversion_cancelled
    conversion_cancelled = flag
def is_conversion_in_progress():
    """Return the current value of the module's in-progress flag.

    The flag is raised when ``convert_file`` starts and lowered when it
    finishes, whatever the outcome.
    """
    # Reading a module-level name needs no ``global`` declaration.
    return _conversion_in_progress
def check_cancellation():
    """Tell whether the user has asked for the conversion to stop.

    Returns:
        bool: ``True`` when a cancellation flag is installed and set,
        ``False`` otherwise (including when no flag is installed).
    """
    # Guard clause: no flag installed, or flag not raised -> keep going.
    if not conversion_cancelled or not conversion_cancelled.is_set():
        return False

    logging.info("Cancellation detected in check_cancellation")
    return True
def safe_delete_file(file_path):
    """Remove *file_path* if it exists, never raising.

    Args:
        file_path: Path of the file to remove. ``None``, an empty value, or
            a path that does not exist is silently ignored.

    Any exception raised while unlinking is logged and swallowed, since
    this is best-effort cleanup.
    """
    # Nothing to do for a missing path or an already-deleted file.
    if not file_path or not os.path.exists(file_path):
        return

    try:
        os.unlink(file_path)
    except Exception as e:
        logging.error(f"Error cleaning up temp file {file_path}: {e}")
# Download-file extension per output format, keyed by the lower-cased format
# name (the same normalisation that is applied before calling the parser).
_OUTPUT_EXTENSIONS = {
    "markdown": ".md",
    "json": ".json",
    "text": ".txt",
    "document tags": ".doctags",
}


def convert_file(file_path, parser_name, ocr_method_name, output_format):
    """Convert a file using the specified parser and OCR method.

    The input is first copied to a temporary file, parsed through
    ``ParserFactory.parse_document``, and the result is written to a second
    temporary file offered for download. Cancellation is polled between
    every stage and between chunks of the copy/write loops.

    Args:
        file_path: Path to the file to convert. A falsy value is rejected.
        parser_name: Name of the parser to use.
        ocr_method_name: Name of the OCR method to use.
        output_format: Output format (Markdown, JSON, Text, Document Tags).
            Matched case-insensitively when choosing the download
            extension; unknown formats fall back to ``.txt``.

    Returns:
        tuple: ``(content, download_file_path)`` on success. On failure or
        cancellation the first element is a human-readable message
        (``"Please upload a file."``, ``"Conversion cancelled."``,
        ``"Error creating temporary file: ..."`` or ``"Error: ..."``) and
        the second element is ``None``. No exception is raised for
        copy, parse, or write failures.
    """
    global _conversion_in_progress

    _conversion_in_progress = True

    temp_input = None    # private copy of the upload, always removed
    tmp_path = None      # download file, removed unless handed to the caller
    keep_output = False  # flipped only on the successful return path

    # Every exit path below leaves cleanup to the single ``finally`` block.
    # Deleting from there (rather than inside the ``with`` blocks) means a
    # file is never unlinked while its handle is still open, which fails on
    # Windows.
    try:
        if not file_path:
            return "Please upload a file.", None

        if check_cancellation():
            logging.info("Cancellation detected at start of convert_file")
            return "Conversion cancelled.", None

        # Work on a copy that carries a tempfile-generated name (the original
        # comment calls this an "English filename"), keeping the suffix.
        try:
            original_ext = Path(file_path).suffix
            with tempfile.NamedTemporaryFile(
                suffix=original_ext, delete=False
            ) as temp_file:
                temp_input = temp_file.name
                with open(file_path, "rb") as original:
                    chunk_size = 1024 * 1024  # 1MB chunks
                    while True:
                        # Poll between chunks so large uploads stay cancellable.
                        if check_cancellation():
                            logging.info("Cancellation detected during file copy")
                            return "Conversion cancelled.", None
                        chunk = original.read(chunk_size)
                        if not chunk:
                            break
                        temp_file.write(chunk)
        except Exception as e:
            return f"Error creating temporary file: {e}", None

        if check_cancellation():
            logging.info("Cancellation detected after file preparation")
            return "Conversion cancelled.", None

        try:
            # perf_counter is monotonic, so the logged duration cannot be
            # skewed by wall-clock adjustments.
            start = time.perf_counter()
            content = ParserFactory.parse_document(
                file_path=temp_input,
                parser_name=parser_name,
                ocr_method_name=ocr_method_name,
                output_format=output_format.lower(),
                cancellation_flag=conversion_cancelled,  # Pass the flag to parsers
            )

            # Parsers signal cancellation through this sentinel string.
            if content == "Conversion cancelled.":
                logging.info("Parser reported cancellation")
                return content, None

            duration = time.perf_counter() - start
            logging.info(f"Processed in {duration:.2f} seconds.")

            if check_cancellation():
                logging.info("Cancellation detected after processing")
                return "Conversion cancelled.", None
        except Exception as e:
            return f"Error: {e}", None

        # Same normalisation as the parser call above, so "markdown" and
        # "Markdown" yield the same extension.
        ext = _OUTPUT_EXTENSIONS.get(output_format.lower(), ".txt")

        try:
            with tempfile.NamedTemporaryFile(
                mode="w", suffix=ext, delete=False, encoding="utf-8"
            ) as tmp:
                tmp_path = tmp.name
                chunk_size = 10000  # characters
                for i in range(0, len(content), chunk_size):
                    if check_cancellation():
                        logging.info("Cancellation detected during output file writing")
                        return "Conversion cancelled.", None
                    tmp.write(content[i:i + chunk_size])
        except Exception as e:
            return f"Error: {e}", None

        # Final poll: decide the return value and the fate of the output file
        # together, so the caller never receives a path that the cleanup
        # below is about to delete.
        if check_cancellation():
            logging.info("Cancellation detected after output file writing")
            return "Conversion cancelled.", None

        keep_output = True
        return content, tmp_path
    finally:
        safe_delete_file(temp_input)
        if not keep_output:
            safe_delete_file(tmp_path)
        # Always clear the conversion in progress flag when done
        _conversion_in_progress = False
|