Spaces:
Running
on
Zero
Running
on
Zero
Add script to convert LaTeX to Markdown; adjust UI output to work with GOT-OCR
Browse files- src/parsers/got_ocr_parser.py +14 -1
- src/ui/ui.py +6 -6
- src/utils/__init__.py +5 -0
- src/utils/latex_converter.py +186 -0
src/parsers/got_ocr_parser.py
CHANGED
@@ -11,6 +11,7 @@ os.environ["TORCH_AMP_AUTOCAST_DTYPE"] = "float16"
|
|
11 |
|
12 |
from src.parsers.parser_interface import DocumentParser
|
13 |
from src.parsers.parser_registry import ParserRegistry
|
|
|
14 |
|
15 |
# Configure logging
|
16 |
logger = logging.getLogger(__name__)
|
@@ -159,7 +160,7 @@ class GotOcrParser(DocumentParser):
|
|
159 |
**kwargs: Additional arguments to pass to the model
|
160 |
|
161 |
Returns:
|
162 |
-
Extracted text from the image
|
163 |
"""
|
164 |
# Verify dependencies are installed
|
165 |
if not self._check_dependencies():
|
@@ -211,6 +212,12 @@ class GotOcrParser(DocumentParser):
|
|
211 |
str(file_path),
|
212 |
ocr_type='ocr'
|
213 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
214 |
return result
|
215 |
except RuntimeError as e:
|
216 |
# Check if it's a bfloat16 error
|
@@ -243,6 +250,12 @@ class GotOcrParser(DocumentParser):
|
|
243 |
|
244 |
# Restore default dtype
|
245 |
torch.set_default_dtype(original_dtype)
|
|
|
|
|
|
|
|
|
|
|
|
|
246 |
return result
|
247 |
except Exception as inner_e:
|
248 |
logger.error(f"Float16 fallback failed: {str(inner_e)}")
|
|
|
11 |
|
12 |
from src.parsers.parser_interface import DocumentParser
|
13 |
from src.parsers.parser_registry import ParserRegistry
|
14 |
+
from src.utils.latex_converter import latex_to_markdown
|
15 |
|
16 |
# Configure logging
|
17 |
logger = logging.getLogger(__name__)
|
|
|
160 |
**kwargs: Additional arguments to pass to the model
|
161 |
|
162 |
Returns:
|
163 |
+
Extracted text from the image, converted to Markdown if formatted
|
164 |
"""
|
165 |
# Verify dependencies are installed
|
166 |
if not self._check_dependencies():
|
|
|
212 |
str(file_path),
|
213 |
ocr_type='ocr'
|
214 |
)
|
215 |
+
|
216 |
+
# Convert LaTeX to Markdown for better display in UI
|
217 |
+
if ocr_type == "format":
|
218 |
+
logger.info("Converting formatted LaTeX output to Markdown")
|
219 |
+
result = latex_to_markdown(result)
|
220 |
+
|
221 |
return result
|
222 |
except RuntimeError as e:
|
223 |
# Check if it's a bfloat16 error
|
|
|
250 |
|
251 |
# Restore default dtype
|
252 |
torch.set_default_dtype(original_dtype)
|
253 |
+
|
254 |
+
# Convert LaTeX to Markdown for better display in UI
|
255 |
+
if ocr_type == "format":
|
256 |
+
logger.info("Converting formatted LaTeX output to Markdown")
|
257 |
+
result = latex_to_markdown(result)
|
258 |
+
|
259 |
return result
|
260 |
except Exception as inner_e:
|
261 |
logger.error(f"Float16 fallback failed: {str(inner_e)}")
|
src/ui/ui.py
CHANGED
@@ -80,13 +80,13 @@ def handle_convert(file_path, parser_name, ocr_method_name, output_format, is_ca
|
|
80 |
# Check if we should cancel before starting
|
81 |
if is_cancelled:
|
82 |
logger.info("Conversion cancelled before starting")
|
83 |
-
return "Conversion cancelled.", None, gr.update(visible=False), gr.update(visible=True), gr.update(visible=False)
|
84 |
|
85 |
# Validate file type for the selected parser
|
86 |
is_valid, error_msg = validate_file_for_parser(file_path, parser_name)
|
87 |
if not is_valid:
|
88 |
logger.error(f"File validation error: {error_msg}")
|
89 |
-
return f"Error: {error_msg}", None, gr.update(visible=False), gr.update(visible=True), gr.update(visible=False)
|
90 |
|
91 |
logger.info("Starting conversion with cancellation flag cleared")
|
92 |
|
@@ -107,14 +107,14 @@ def handle_convert(file_path, parser_name, ocr_method_name, output_format, is_ca
|
|
107 |
thread.join(timeout=0.5)
|
108 |
if thread.is_alive():
|
109 |
logger.warning("Thread did not finish within timeout")
|
110 |
-
return "Conversion cancelled.", None, gr.update(visible=False), gr.update(visible=True), gr.update(visible=False)
|
111 |
|
112 |
# Sleep briefly to avoid busy waiting
|
113 |
time.sleep(0.1)
|
114 |
|
115 |
# Thread has completed, check results
|
116 |
if results["error"]:
|
117 |
-
return f"Error: {results['error']}", None, gr.update(visible=False), gr.update(visible=True), gr.update(visible=False)
|
118 |
|
119 |
content = results["content"]
|
120 |
download_file = results["download_file"]
|
@@ -122,14 +122,14 @@ def handle_convert(file_path, parser_name, ocr_method_name, output_format, is_ca
|
|
122 |
# If conversion returned a cancellation message
|
123 |
if content == "Conversion cancelled.":
|
124 |
logger.info("Converter returned cancellation message")
|
125 |
-
return content, None, gr.update(visible=False), gr.update(visible=True), gr.update(visible=False)
|
126 |
|
127 |
# Format the content and wrap it in the scrollable container
|
128 |
formatted_content = format_markdown_content(str(content))
|
129 |
html_output = f"<div class='output-container'>{formatted_content}</div>"
|
130 |
|
131 |
logger.info("Conversion completed successfully")
|
132 |
-
return html_output, download_file, gr.update(visible=False), gr.update(visible=True), gr.update(visible=False)
|
133 |
|
134 |
def create_ui():
|
135 |
with gr.Blocks(css="""
|
|
|
80 |
# Check if we should cancel before starting
|
81 |
if is_cancelled:
|
82 |
logger.info("Conversion cancelled before starting")
|
83 |
+
return "Conversion cancelled.", None, gr.update(visible=False), gr.update(visible=True), gr.update(visible=False)
|
84 |
|
85 |
# Validate file type for the selected parser
|
86 |
is_valid, error_msg = validate_file_for_parser(file_path, parser_name)
|
87 |
if not is_valid:
|
88 |
logger.error(f"File validation error: {error_msg}")
|
89 |
+
return f"Error: {error_msg}", None, gr.update(visible=False), gr.update(visible=True), gr.update(visible=False)
|
90 |
|
91 |
logger.info("Starting conversion with cancellation flag cleared")
|
92 |
|
|
|
107 |
thread.join(timeout=0.5)
|
108 |
if thread.is_alive():
|
109 |
logger.warning("Thread did not finish within timeout")
|
110 |
+
return "Conversion cancelled.", None, gr.update(visible=False), gr.update(visible=True), gr.update(visible=False)
|
111 |
|
112 |
# Sleep briefly to avoid busy waiting
|
113 |
time.sleep(0.1)
|
114 |
|
115 |
# Thread has completed, check results
|
116 |
if results["error"]:
|
117 |
+
return f"Error: {results['error']}", None, gr.update(visible=False), gr.update(visible=True), gr.update(visible=False)
|
118 |
|
119 |
content = results["content"]
|
120 |
download_file = results["download_file"]
|
|
|
122 |
# If conversion returned a cancellation message
|
123 |
if content == "Conversion cancelled.":
|
124 |
logger.info("Converter returned cancellation message")
|
125 |
+
return content, None, gr.update(visible=False), gr.update(visible=True), gr.update(visible=False)
|
126 |
|
127 |
# Format the content and wrap it in the scrollable container
|
128 |
formatted_content = format_markdown_content(str(content))
|
129 |
html_output = f"<div class='output-container'>{formatted_content}</div>"
|
130 |
|
131 |
logger.info("Conversion completed successfully")
|
132 |
+
return html_output, download_file, gr.update(visible=False), gr.update(visible=True), gr.update(visible=False)
|
133 |
|
134 |
def create_ui():
|
135 |
with gr.Blocks(css="""
|
src/utils/__init__.py
ADDED
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""Utility functions for the Markit application."""
|
2 |
+
|
3 |
+
from src.utils.latex_converter import latex_to_markdown
|
4 |
+
|
5 |
+
__all__ = ['latex_to_markdown']
|
src/utils/latex_converter.py
ADDED
@@ -0,0 +1,186 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import re
|
2 |
+
import logging
|
3 |
+
|
4 |
+
# Configure logging
|
5 |
+
logger = logging.getLogger(__name__)
|
6 |
+
|
7 |
+
def latex_to_markdown(latex_text):
    """
    Translate LaTeX formatted text from GOT-OCR into Markdown.

    The text is run through a fixed sequence of converters: tables, math
    environments, formatting commands, lists, and finally a cleanup pass
    for remaining LaTeX-specific syntax.

    Args:
        latex_text (str): LaTeX formatted text

    Returns:
        str: Markdown formatted text, or an empty string when the input
            is empty (or otherwise falsy)
    """
    if not latex_text:
        return ""

    logger.info("Converting LaTeX to Markdown")

    # Converters are applied one after another, each receiving the output
    # of the previous one.
    pipeline = (
        convert_latex_tables,
        convert_math_environments,
        convert_formatting_commands,
        convert_latex_lists,
        cleanup_latex,
    )

    md_text = latex_text
    for stage in pipeline:
        md_text = stage(md_text)

    logger.info("LaTeX to Markdown conversion completed")
    return md_text
|
42 |
+
|
43 |
+
def convert_latex_tables(latex_text):
    """
    Convert LaTeX tables to Markdown tables.

    Each ``tabular`` environment becomes a pipe table whose first row is
    treated as the header. The column specification that follows the
    opening of a ``tabular`` (and any optional position argument) is
    discarded instead of leaking into the first cell. A ``table`` float
    that wraps a ``tabular`` is unwrapped; a ``table`` without an inner
    ``tabular`` has its content treated as rows directly.

    Args:
        latex_text (str): Text possibly containing LaTeX tables

    Returns:
        str: Text with the recognized tables replaced by Markdown tables
    """

    def rows_to_markdown(body):
        md_rows = []
        # Rows are terminated by a double backslash
        for row in re.split(r'\\\\', body):
            # Drop rules before testing for emptiness so a row holding
            # only a rule does not become an empty table row
            row = row.replace('\\hline', '').strip()
            if not row:
                continue

            # Split on unescaped & only; an escaped ampersand is cell text
            cells = [cell.strip() for cell in re.split(r'(?<!\\)&', row)]
            md_rows.append('| ' + ' | '.join(cells) + ' |')

            # Header separator goes right after the first real row
            if len(md_rows) == 1:
                md_rows.append('| ' + ' | '.join(['---'] * len(cells)) + ' |')

        return '\n'.join(md_rows)

    def unwrap_table(match):
        inner = match.group(1)
        # Leave an inner tabular for the dedicated pass below
        if '\\begin{tabular}' in inner:
            return inner
        return rows_to_markdown(inner)

    # Table floats: optional placement argument such as [h] is skipped
    table_pattern = r'\\begin\{table\}(?:\[[^\]]*\])?(.*?)\\end\{table\}'
    result = re.sub(table_pattern, unwrap_table, latex_text, flags=re.DOTALL)

    # Tabular: optional [pos], optional {column spec} (one level of nested
    # braces allowed, e.g. p{3cm}), then the rows
    tabular_pattern = (
        r'\\begin\{tabular\}\s*(?:\[[^\]]*\]\s*)?'
        r'(?:\{(?:[^{}]|\{[^{}]*\})*\})?'
        r'(.*?)\\end\{tabular\}'
    )
    result = re.sub(
        tabular_pattern,
        lambda match: rows_to_markdown(match.group(1)),
        result,
        flags=re.DOTALL,
    )
    return result
|
87 |
+
|
88 |
+
def convert_math_environments(latex_text):
    """
    Convert LaTeX math environments to Markdown math syntax.

    Display environments (equation, align, eqnarray, including their
    starred variants, and bracket-delimited display math) become
    ``$$ ... $$``. Inline math (parenthesis-delimited or the ``math``
    environment) becomes ``$ ... $``.

    Args:
        latex_text (str): Text possibly containing LaTeX math

    Returns:
        str: Text with math delimiters rewritten for Markdown
    """
    result = latex_text

    # Display environments; the backreferences make sure the closing tag
    # matches the opening one, star included
    result = re.sub(
        r'\\begin\{(equation|align|eqnarray)(\*?)\}(.*?)\\end\{\1\2\}',
        lambda match: '$$' + match.group(3) + '$$',
        result,
        flags=re.DOTALL,
    )

    # \[ ... \] display math. The lookbehind skips a bracket that follows a
    # line break, e.g. the spacing argument in "\\[2pt]"
    result = re.sub(r'(?<!\\)\\[\[\]]', '$$', result)

    # Convert inline math \( ... \) to $ ... $
    result = re.sub(r'\\(\(|\))', '$', result)

    # Handle the standalone math environment
    result = re.sub(
        r'\\begin\{math\}(.*?)\\end\{math\}',
        lambda match: '$' + match.group(1) + '$',
        result,
        flags=re.DOTALL,
    )

    return result
|
104 |
+
|
105 |
+
def convert_formatting_commands(latex_text):
    """
    Convert LaTeX formatting commands to Markdown syntax.

    Handles bold, italic, emphasis, underline, title and sectioning
    commands that take a single braced argument. The argument is located
    by brace matching, so nested commands and arguments that themselves
    contain braces are converted correctly. A command whose closing brace
    cannot be found is left unchanged.

    Args:
        latex_text (str): Text possibly containing formatting commands

    Returns:
        str: Text with the recognized commands replaced by Markdown
    """
    # command name -> (text placed before the argument, text placed after)
    markup = {
        'textbf': ('**', '**'),
        'bf': ('**', '**'),
        'textit': ('*', '*'),
        'it': ('*', '*'),
        'emph': ('*', '*'),
        # Underline has no direct Markdown equivalent, use emphasis
        'underline': ('_', '_'),
        # The document title becomes a top-level heading
        'title': ('# ', ''),
        'section': ('## ', ''),
        'subsection': ('### ', ''),
        'subsubsection': ('#### ', ''),
    }
    # The brace must follow the name directly, so e.g. "\item" never
    # matches the "it" command
    opener = re.compile(r'\\(' + '|'.join(markup) + r')\{')

    def closing_brace(text, start):
        # Index of the brace closing the group opened just before start,
        # or -1 when the group is never closed
        depth = 1
        index = start
        while index < len(text):
            char = text[index]
            if char == '\\':
                # Skip the escaped character (covers escaped braces)
                index += 2
                continue
            if char == '{':
                depth += 1
            elif char == '}':
                depth -= 1
                if depth == 0:
                    return index
            index += 1
        return -1

    def convert(text):
        pieces = []
        position = 0
        while True:
            match = opener.search(text, position)
            if match is None:
                pieces.append(text[position:])
                break

            end = closing_brace(text, match.end())
            if end < 0:
                # Unbalanced: keep the command text and continue after it
                pieces.append(text[position:match.end()])
                position = match.end()
                continue

            prefix, suffix = markup[match.group(1)]
            pieces.append(text[position:match.start()])
            # Recurse so commands nested in the argument are converted too
            pieces.append(prefix + convert(text[match.end():end]) + suffix)
            position = end + 1
        return ''.join(pieces)

    return convert(latex_text)
|
130 |
+
|
131 |
+
def convert_latex_lists(latex_text):
    """
    Convert LaTeX lists to Markdown lists.

    ``itemize`` environments become ``-`` bullet lists and ``enumerate``
    environments become numbered lists. Nested lists are converted from
    the innermost level outwards, and the lines of a nested list are
    indented under their parent item.

    Args:
        latex_text (str): Text possibly containing LaTeX lists

    Returns:
        str: Text with the recognized lists replaced by Markdown lists
    """
    # Matches only lists that contain no further list opening, i.e. the
    # innermost ones; the backreference pairs begin/end of the same kind
    list_pattern = re.compile(
        r'\\begin\{(itemize|enumerate)\}'
        r'((?:(?!\\begin\{(?:itemize|enumerate)\}).)*?)'
        r'\\end\{\1\}',
        re.DOTALL,
    )
    item_pattern = re.compile(r'\\item\s+(.*?)(?=\\item|$)', re.DOTALL)

    def render(match):
        ordered = match.group(1) == 'enumerate'
        lines = []
        items = item_pattern.findall(match.group(2))
        for number, item in enumerate(items, start=1):
            marker = f'{number}. ' if ordered else '- '
            first, *rest = item.strip().split('\n')
            lines.append(marker + first)
            # Indent continuation lines (including an already converted
            # nested list) so they stay attached to this item
            indent = ' ' * len(marker)
            lines.extend(indent + line if line.strip() else line for line in rest)
        return '\n' + '\n'.join(lines) + '\n'

    result = latex_text
    # Every pass removes the current innermost lists; repeat until no list
    # environment is left to convert
    while True:
        result, replaced = list_pattern.subn(render, result)
        if not replaced:
            break

    return result
|
156 |
+
|
157 |
+
def cleanup_latex(latex_text):
    """
    Clean up any remaining LaTeX-specific syntax.

    Removes document structure commands (document begin/end, maketitle,
    documentclass and usepackage, with or without a bracketed optional
    argument), unescapes common special characters, and collapses runs of
    three or more line breaks into a single blank line.

    Args:
        latex_text (str): Text after the other conversion steps

    Returns:
        str: Cleaned-up text
    """
    result = latex_text

    # Remove LaTeX document structure commands
    result = re.sub(r'\\begin\{document\}|\\end\{document\}', '', result)
    result = re.sub(r'\\maketitle', '', result)
    # Preamble commands may carry options, e.g. \documentclass[12pt]{article}
    result = re.sub(
        r'\\(?:documentclass|usepackage)(?:\[[^\]]*\])?\{[^}]*\}', '', result
    )

    # Convert special characters
    latex_special_chars = {
        r'\&': '&',
        r'\%': '%',
        r'\$': '$',
        r'\#': '#',
        r'\_': '_',
        r'\{': '{',
        r'\}': '}',
        r'~': ' ',
        r'\ldots': '...',
    }

    for latex_char, md_char in latex_special_chars.items():
        result = result.replace(latex_char, md_char)

    # Fix extra whitespace: 3+ consecutive line breaks become one blank line
    result = re.sub(r'\n\s*\n\s*\n', '\n\n', result)

    return result
|