Spaces:
Sleeping
Sleeping
cancel button fix 5
Browse files- src/converter.py +51 -14
- src/parser_factory.py +52 -21
src/converter.py
CHANGED
@@ -12,13 +12,13 @@ from parser_factory import ParserFactory
|
|
12 |
import parsers
|
13 |
|
14 |
# Reference to the cancellation flag from ui.py
|
15 |
-
# This will be set by the UI when the cancel button is clicked
|
16 |
conversion_cancelled = None
|
17 |
|
18 |
def set_cancellation_flag(flag):
|
19 |
"""Set the reference to the cancellation flag from ui.py"""
|
20 |
global conversion_cancelled
|
21 |
conversion_cancelled = flag
|
|
|
22 |
|
23 |
|
24 |
def convert_file(file_path, parser_name, ocr_method_name, output_format):
|
@@ -39,23 +39,35 @@ def convert_file(file_path, parser_name, ocr_method_name, output_format):
|
|
39 |
if not file_path:
|
40 |
return "Please upload a file.", None
|
41 |
|
|
|
|
|
|
|
|
|
42 |
# Create a temporary file with English filename
|
43 |
temp_input = None
|
44 |
try:
|
|
|
|
|
|
|
|
|
|
|
45 |
original_ext = Path(file_path).suffix
|
46 |
with tempfile.NamedTemporaryFile(suffix=original_ext, delete=False) as temp_input:
|
47 |
# Copy the content of original file to temp file
|
48 |
with open(file_path, 'rb') as original:
|
49 |
temp_input.write(original.read())
|
50 |
file_path = temp_input.name
|
51 |
-
|
52 |
-
#
|
53 |
if conversion_cancelled and conversion_cancelled.is_set():
|
|
|
54 |
cleanup_temp_file(temp_input.name)
|
55 |
return "Conversion cancelled.", None
|
56 |
|
57 |
# Use the parser factory to parse the document
|
58 |
start = time.time()
|
|
|
|
|
59 |
content = ParserFactory.parse_document(
|
60 |
file_path=file_path,
|
61 |
parser_name=parser_name,
|
@@ -64,28 +76,52 @@ def convert_file(file_path, parser_name, ocr_method_name, output_format):
|
|
64 |
cancellation_flag=conversion_cancelled
|
65 |
)
|
66 |
|
67 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
68 |
if conversion_cancelled and conversion_cancelled.is_set():
|
|
|
69 |
cleanup_temp_file(temp_input.name)
|
70 |
return "Conversion cancelled.", None
|
71 |
-
|
72 |
duration = time.time() - start
|
73 |
logging.info(f"Processed in {duration:.2f} seconds.")
|
74 |
|
75 |
-
#
|
76 |
-
|
|
|
|
|
|
|
77 |
|
78 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
79 |
with tempfile.NamedTemporaryFile(mode="w", suffix=ext, delete=False, encoding="utf-8") as tmp:
|
80 |
tmp.write(content)
|
81 |
tmp_path = tmp.name
|
82 |
-
|
83 |
-
#
|
84 |
cleanup_temp_file(temp_input.name)
|
|
|
85 |
return content, tmp_path
|
86 |
-
|
87 |
except Exception as e:
|
88 |
-
|
|
|
89 |
cleanup_temp_file(temp_input.name)
|
90 |
return f"Error: {e}", None
|
91 |
|
@@ -94,8 +130,9 @@ def cleanup_temp_file(file_path):
|
|
94 |
"""Helper function to clean up temporary files"""
|
95 |
try:
|
96 |
os.unlink(file_path)
|
97 |
-
|
98 |
-
|
|
|
99 |
|
100 |
|
101 |
def get_output_extension(output_format):
|
|
|
12 |
import parsers
|
13 |
|
14 |
# Reference to the cancellation flag from ui.py
|
|
|
15 |
conversion_cancelled = None
|
16 |
|
17 |
def set_cancellation_flag(flag):
    """Store *flag* as the module-wide cancellation flag and log the change.

    The value is kept in the module global ``conversion_cancelled``; the
    rest of this module reads that global and calls ``is_set()`` on it.
    # NOTE(review): presumably a threading.Event supplied by ui.py — confirm.
    """
    global conversion_cancelled
    message = f"Cancellation flag set: {flag}"
    conversion_cancelled = flag
    logging.info(message)
|
22 |
|
23 |
|
24 |
def convert_file(file_path, parser_name, ocr_method_name, output_format):
|
|
|
39 |
if not file_path:
|
40 |
return "Please upload a file.", None
|
41 |
|
42 |
+
# Log cancellation state at the start
|
43 |
+
if conversion_cancelled:
|
44 |
+
logging.info(f"Starting conversion. Cancellation flag state: {conversion_cancelled.is_set()}")
|
45 |
+
|
46 |
# Create a temporary file with English filename
|
47 |
temp_input = None
|
48 |
try:
|
49 |
+
# Check for early cancellation
|
50 |
+
if conversion_cancelled and conversion_cancelled.is_set():
|
51 |
+
logging.info("Conversion cancelled before file preparation")
|
52 |
+
return "Conversion cancelled.", None
|
53 |
+
|
54 |
original_ext = Path(file_path).suffix
|
55 |
with tempfile.NamedTemporaryFile(suffix=original_ext, delete=False) as temp_input:
|
56 |
# Copy the content of original file to temp file
|
57 |
with open(file_path, 'rb') as original:
|
58 |
temp_input.write(original.read())
|
59 |
file_path = temp_input.name
|
60 |
+
|
61 |
+
# Check for cancellation after file preparation
|
62 |
if conversion_cancelled and conversion_cancelled.is_set():
|
63 |
+
logging.info("Conversion cancelled after file preparation")
|
64 |
cleanup_temp_file(temp_input.name)
|
65 |
return "Conversion cancelled.", None
|
66 |
|
67 |
# Use the parser factory to parse the document
|
68 |
start = time.time()
|
69 |
+
|
70 |
+
# Pass the cancellation flag to the parser factory
|
71 |
content = ParserFactory.parse_document(
|
72 |
file_path=file_path,
|
73 |
parser_name=parser_name,
|
|
|
76 |
cancellation_flag=conversion_cancelled
|
77 |
)
|
78 |
|
79 |
+
# Check if the content indicates cancellation
|
80 |
+
if content == "Conversion cancelled.":
|
81 |
+
logging.info("Parser reported cancellation")
|
82 |
+
cleanup_temp_file(temp_input.name)
|
83 |
+
return content, None
|
84 |
+
|
85 |
+
# Check for cancellation after parsing
|
86 |
if conversion_cancelled and conversion_cancelled.is_set():
|
87 |
+
logging.info("Conversion cancelled after parsing")
|
88 |
cleanup_temp_file(temp_input.name)
|
89 |
return "Conversion cancelled.", None
|
90 |
+
|
91 |
duration = time.time() - start
|
92 |
logging.info(f"Processed in {duration:.2f} seconds.")
|
93 |
|
94 |
+
# Check for cancellation before file creation
|
95 |
+
if conversion_cancelled and conversion_cancelled.is_set():
|
96 |
+
logging.info("Conversion cancelled before file creation")
|
97 |
+
cleanup_temp_file(temp_input.name)
|
98 |
+
return "Conversion cancelled.", None
|
99 |
|
100 |
+
# Determine the file extension based on the output format
|
101 |
+
if output_format == "Markdown":
|
102 |
+
ext = ".md"
|
103 |
+
elif output_format == "JSON":
|
104 |
+
ext = ".json"
|
105 |
+
elif output_format == "Text":
|
106 |
+
ext = ".txt"
|
107 |
+
elif output_format == "Document Tags":
|
108 |
+
ext = ".doctags"
|
109 |
+
else:
|
110 |
+
ext = ".txt"
|
111 |
+
|
112 |
+
# Create a temporary file for download
|
113 |
with tempfile.NamedTemporaryFile(mode="w", suffix=ext, delete=False, encoding="utf-8") as tmp:
|
114 |
tmp.write(content)
|
115 |
tmp_path = tmp.name
|
116 |
+
|
117 |
+
# Clean up the temporary input file
|
118 |
cleanup_temp_file(temp_input.name)
|
119 |
+
|
120 |
return content, tmp_path
|
121 |
+
|
122 |
except Exception as e:
|
123 |
+
logging.error(f"Error during conversion: {str(e)}")
|
124 |
+
if temp_input and hasattr(temp_input, 'name'):
|
125 |
cleanup_temp_file(temp_input.name)
|
126 |
return f"Error: {e}", None
|
127 |
|
|
|
130 |
"""Helper function to clean up temporary files"""
|
131 |
try:
|
132 |
os.unlink(file_path)
|
133 |
+
logging.info(f"Cleaned up temporary file: {file_path}")
|
134 |
+
except Exception as e:
|
135 |
+
logging.error(f"Failed to clean up temporary file {file_path}: {str(e)}")
|
136 |
|
137 |
|
138 |
def get_output_extension(output_format):
|
src/parser_factory.py
CHANGED
@@ -1,6 +1,7 @@
|
|
1 |
from typing import Optional, Dict, Any, Union
|
2 |
from pathlib import Path
|
3 |
import threading
|
|
|
4 |
|
5 |
from parser_interface import DocumentParser
|
6 |
from parser_registry import ParserRegistry
|
@@ -28,29 +29,59 @@ class ParserFactory:
|
|
28 |
return parser_class()
|
29 |
|
30 |
@classmethod
|
31 |
-
def parse_document(cls,
|
32 |
-
|
|
|
|
|
|
|
33 |
**kwargs) -> str:
|
34 |
-
"""
|
35 |
-
|
36 |
-
if cancellation_flag and cancellation_flag.is_set():
|
37 |
-
return "Conversion cancelled."
|
38 |
-
|
39 |
-
parser = cls.create_parser(parser_name)
|
40 |
-
if not parser:
|
41 |
-
raise ValueError(f"Unknown parser: {parser_name}")
|
42 |
-
|
43 |
-
# Get the internal OCR method ID
|
44 |
-
ocr_method_id = ParserRegistry.get_ocr_method_id(parser_name, ocr_method_name)
|
45 |
-
if not ocr_method_id:
|
46 |
-
raise ValueError(f"Unknown OCR method: {ocr_method_name} for parser {parser_name}")
|
47 |
-
|
48 |
-
# Parse the document, passing the cancellation flag
|
49 |
-
kwargs['cancellation_flag'] = cancellation_flag
|
50 |
-
result = parser.parse(file_path, ocr_method=ocr_method_id, **kwargs)
|
51 |
|
52 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
53 |
if cancellation_flag and cancellation_flag.is_set():
|
|
|
54 |
return "Conversion cancelled."
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
55 |
|
56 |
-
|
|
|
|
|
|
|
|
|
|
|
|
1 |
from typing import Optional, Dict, Any, Union
|
2 |
from pathlib import Path
|
3 |
import threading
|
4 |
+
import logging
|
5 |
|
6 |
from parser_interface import DocumentParser
|
7 |
from parser_registry import ParserRegistry
|
|
|
29 |
return parser_class()
|
30 |
|
31 |
@classmethod
|
32 |
+
def parse_document(cls,
                   file_path: Union[str, Path],
                   parser_name: str,
                   ocr_method_name: str,
                   cancellation_flag: Optional[threading.Event] = None,
                   **kwargs) -> str:
    """
    Parse a document using the specified parser and OCR method.

    Cancellation is polled at three points: on entry, just before the
    parser runs, and right after it returns. The flag is also forwarded
    to the parser as the ``cancellation_flag`` keyword argument.

    Args:
        file_path: Path to the document
        parser_name: Name of the parser to use
        ocr_method_name: Display name of the OCR method to use
        cancellation_flag: Optional flag to check for cancellation
        **kwargs: Additional parser-specific options

    Returns:
        str: The parsed content, or the literal string
        "Conversion cancelled." when the flag was set at any checkpoint
        or at the time an exception was raised.

    Raises:
        ValueError: If the parser name or OCR method name is unknown
            (and the flag is not set).
        Exception: Any error raised while parsing is re-raised unchanged
            when the flag is not set.
    """
    # Sentinel that callers compare against; must stay byte-identical.
    cancelled_message = "Conversion cancelled."

    def is_cancelled() -> bool:
        # Truthiness check keeps the original tolerance for a None flag.
        return bool(cancellation_flag and cancellation_flag.is_set())

    # Check for cancellation at the start
    if is_cancelled():
        logging.info("Conversion cancelled at the start of parsing")
        return cancelled_message

    try:
        parser = cls.create_parser(parser_name)
        if not parser:
            raise ValueError(f"Unknown parser: {parser_name}")

        # Get the internal OCR method ID
        ocr_method_id = ParserRegistry.get_ocr_method_id(parser_name, ocr_method_name)
        if not ocr_method_id:
            raise ValueError(f"Unknown OCR method: {ocr_method_name} for parser {parser_name}")

        # Check for cancellation before parsing
        if is_cancelled():
            logging.info("Conversion cancelled before parsing starts")
            return cancelled_message

        # Parse the document, passing the cancellation flag
        kwargs['cancellation_flag'] = cancellation_flag
        result = parser.parse(file_path, ocr_method=ocr_method_id, **kwargs)

        # Check for cancellation after parsing
        if is_cancelled():
            logging.info("Conversion cancelled after parsing completes")
            return cancelled_message

        return result

    except Exception as exc:
        # Decide whether this is a cancellation BEFORE logging: an
        # exception raised while the flag is set is treated as the
        # cancellation itself, so it should not show up as an error.
        if is_cancelled():
            logging.info("Conversion cancelled during parsing (%s)", exc)
            return cancelled_message
        # Real failure: keep the traceback in the log, then propagate.
        logging.exception("Error in parse_document: %s", exc)
        raise
|