AnseMin committed on
Commit
ad248f7
·
1 Parent(s): 2184c47

script to convert LaTeX to Markdown, changing UI output to fit right with GOT-OCR

Browse files
src/parsers/got_ocr_parser.py CHANGED
@@ -11,6 +11,7 @@ os.environ["TORCH_AMP_AUTOCAST_DTYPE"] = "float16"
11
 
12
  from src.parsers.parser_interface import DocumentParser
13
  from src.parsers.parser_registry import ParserRegistry
 
14
 
15
  # Configure logging
16
  logger = logging.getLogger(__name__)
@@ -159,7 +160,7 @@ class GotOcrParser(DocumentParser):
159
  **kwargs: Additional arguments to pass to the model
160
 
161
  Returns:
162
- Extracted text from the image
163
  """
164
  # Verify dependencies are installed
165
  if not self._check_dependencies():
@@ -211,6 +212,12 @@ class GotOcrParser(DocumentParser):
211
  str(file_path),
212
  ocr_type='ocr'
213
  )
 
 
 
 
 
 
214
  return result
215
  except RuntimeError as e:
216
  # Check if it's a bfloat16 error
@@ -243,6 +250,12 @@ class GotOcrParser(DocumentParser):
243
 
244
  # Restore default dtype
245
  torch.set_default_dtype(original_dtype)
 
 
 
 
 
 
246
  return result
247
  except Exception as inner_e:
248
  logger.error(f"Float16 fallback failed: {str(inner_e)}")
 
11
 
12
  from src.parsers.parser_interface import DocumentParser
13
  from src.parsers.parser_registry import ParserRegistry
14
+ from src.utils.latex_converter import latex_to_markdown
15
 
16
  # Configure logging
17
  logger = logging.getLogger(__name__)
 
160
  **kwargs: Additional arguments to pass to the model
161
 
162
  Returns:
163
+ Extracted text from the image, converted to Markdown if formatted
164
  """
165
  # Verify dependencies are installed
166
  if not self._check_dependencies():
 
212
  str(file_path),
213
  ocr_type='ocr'
214
  )
215
+
216
+ # Convert LaTeX to Markdown for better display in UI
217
+ if ocr_type == "format":
218
+ logger.info("Converting formatted LaTeX output to Markdown")
219
+ result = latex_to_markdown(result)
220
+
221
  return result
222
  except RuntimeError as e:
223
  # Check if it's a bfloat16 error
 
250
 
251
  # Restore default dtype
252
  torch.set_default_dtype(original_dtype)
253
+
254
+ # Convert LaTeX to Markdown for better display in UI
255
+ if ocr_type == "format":
256
+ logger.info("Converting formatted LaTeX output to Markdown")
257
+ result = latex_to_markdown(result)
258
+
259
  return result
260
  except Exception as inner_e:
261
  logger.error(f"Float16 fallback failed: {str(inner_e)}")
src/ui/ui.py CHANGED
@@ -80,13 +80,13 @@ def handle_convert(file_path, parser_name, ocr_method_name, output_format, is_ca
80
  # Check if we should cancel before starting
81
  if is_cancelled:
82
  logger.info("Conversion cancelled before starting")
83
- return "Conversion cancelled.", None, gr.update(visible=False), gr.update(visible=True), gr.update(visible=False), None
84
 
85
  # Validate file type for the selected parser
86
  is_valid, error_msg = validate_file_for_parser(file_path, parser_name)
87
  if not is_valid:
88
  logger.error(f"File validation error: {error_msg}")
89
- return f"Error: {error_msg}", None, gr.update(visible=False), gr.update(visible=True), gr.update(visible=False), None
90
 
91
  logger.info("Starting conversion with cancellation flag cleared")
92
 
@@ -107,14 +107,14 @@ def handle_convert(file_path, parser_name, ocr_method_name, output_format, is_ca
107
  thread.join(timeout=0.5)
108
  if thread.is_alive():
109
  logger.warning("Thread did not finish within timeout")
110
- return "Conversion cancelled.", None, gr.update(visible=False), gr.update(visible=True), gr.update(visible=False), None
111
 
112
  # Sleep briefly to avoid busy waiting
113
  time.sleep(0.1)
114
 
115
  # Thread has completed, check results
116
  if results["error"]:
117
- return f"Error: {results['error']}", None, gr.update(visible=False), gr.update(visible=True), gr.update(visible=False), None
118
 
119
  content = results["content"]
120
  download_file = results["download_file"]
@@ -122,14 +122,14 @@ def handle_convert(file_path, parser_name, ocr_method_name, output_format, is_ca
122
  # If conversion returned a cancellation message
123
  if content == "Conversion cancelled.":
124
  logger.info("Converter returned cancellation message")
125
- return content, None, gr.update(visible=False), gr.update(visible=True), gr.update(visible=False), None
126
 
127
  # Format the content and wrap it in the scrollable container
128
  formatted_content = format_markdown_content(str(content))
129
  html_output = f"<div class='output-container'>{formatted_content}</div>"
130
 
131
  logger.info("Conversion completed successfully")
132
- return html_output, download_file, gr.update(visible=False), gr.update(visible=True), gr.update(visible=False), None
133
 
134
  def create_ui():
135
  with gr.Blocks(css="""
 
80
  # Check if we should cancel before starting
81
  if is_cancelled:
82
  logger.info("Conversion cancelled before starting")
83
+ return "Conversion cancelled.", None, gr.update(visible=False), gr.update(visible=True), gr.update(visible=False)
84
 
85
  # Validate file type for the selected parser
86
  is_valid, error_msg = validate_file_for_parser(file_path, parser_name)
87
  if not is_valid:
88
  logger.error(f"File validation error: {error_msg}")
89
+ return f"Error: {error_msg}", None, gr.update(visible=False), gr.update(visible=True), gr.update(visible=False)
90
 
91
  logger.info("Starting conversion with cancellation flag cleared")
92
 
 
107
  thread.join(timeout=0.5)
108
  if thread.is_alive():
109
  logger.warning("Thread did not finish within timeout")
110
+ return "Conversion cancelled.", None, gr.update(visible=False), gr.update(visible=True), gr.update(visible=False)
111
 
112
  # Sleep briefly to avoid busy waiting
113
  time.sleep(0.1)
114
 
115
  # Thread has completed, check results
116
  if results["error"]:
117
+ return f"Error: {results['error']}", None, gr.update(visible=False), gr.update(visible=True), gr.update(visible=False)
118
 
119
  content = results["content"]
120
  download_file = results["download_file"]
 
122
  # If conversion returned a cancellation message
123
  if content == "Conversion cancelled.":
124
  logger.info("Converter returned cancellation message")
125
+ return content, None, gr.update(visible=False), gr.update(visible=True), gr.update(visible=False)
126
 
127
  # Format the content and wrap it in the scrollable container
128
  formatted_content = format_markdown_content(str(content))
129
  html_output = f"<div class='output-container'>{formatted_content}</div>"
130
 
131
  logger.info("Conversion completed successfully")
132
+ return html_output, download_file, gr.update(visible=False), gr.update(visible=True), gr.update(visible=False)
133
 
134
  def create_ui():
135
  with gr.Blocks(css="""
src/utils/__init__.py ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ """Utility functions for the Markit application."""
2
+
3
+ from src.utils.latex_converter import latex_to_markdown
4
+
5
+ __all__ = ['latex_to_markdown']
src/utils/latex_converter.py ADDED
@@ -0,0 +1,186 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import logging
3
+
4
+ # Configure logging
5
+ logger = logging.getLogger(__name__)
6
+
7
def latex_to_markdown(latex_text):
    """
    Turn LaTeX formatted text (as produced by GOT-OCR) into Markdown.

    The text is run through the table, math, formatting, list and clean-up
    converters, in that order; each stage receives the previous stage's
    output.

    Args:
        latex_text (str): LaTeX formatted text

    Returns:
        str: Markdown formatted text, or "" when the input is empty/None
    """
    if latex_text:
        logger.info("Converting LaTeX to Markdown")

        # Conversion stages, applied first to last.
        pipeline = (
            convert_latex_tables,
            convert_math_environments,
            convert_formatting_commands,
            convert_latex_lists,
            cleanup_latex,
        )

        converted = latex_text
        for stage in pipeline:
            converted = stage(converted)

        logger.info("LaTeX to Markdown conversion completed")
        return converted

    # Nothing to convert.
    return ""
42
+
43
def convert_latex_tables(latex_text):
    """Convert LaTeX ``tabular``/``table`` environments to Markdown tables.

    Every row becomes a pipe-delimited line. A ``---`` separator line is
    inserted after the first non-empty row, so that row acts as the header.

    Args:
        latex_text (str): Text that may contain LaTeX table environments.

    Returns:
        str: The text with each matched environment replaced by Markdown
        table rows. Text outside the environments is left untouched.
    """
    # The backreference makes \begin{table} close on \end{table} rather than
    # on the \end{tabular} nested inside it.
    env_pattern = re.compile(
        r'\\begin\{(tabular|table)\}(.*?)\\end\{\1\}', re.DOTALL
    )
    # Optional "[pos]" argument directly after \begin{...}.
    placement = re.compile(r'\A\s*\[[^\]]*\]')
    # Column spec of a tabular; allows one level of nested braces,
    # e.g. {|p{3cm}|c|}.
    column_spec = re.compile(r'\A\s*\{(?:[^{}]|\{[^{}]*\})*\}')
    # Horizontal rules carry no cell content.
    rule_pattern = re.compile(r'\\hline|\\cline\{[^}]*\}')
    # An escaped ampersand (\&) is cell text, not a column separator.
    cell_separator = re.compile(r'(?<!\\)&')

    def replace_table(match):
        env_name = match.group(1)
        body = placement.sub('', match.group(2), count=1)

        if env_name == 'table':
            # A table float usually just wraps a tabular: convert the inner
            # environment and drop the wrapper.
            converted, count = env_pattern.subn(replace_table, body)
            if count:
                return converted
        else:
            body = column_spec.sub('', body, count=1)

        md_rows = []
        for raw_row in body.split('\\\\'):
            row = rule_pattern.sub('', raw_row).strip()
            if not row:
                # Blank rows and rows holding only \hline are skipped.
                continue

            cells = [cell.strip() for cell in cell_separator.split(row)]
            md_rows.append('| ' + ' | '.join(cells) + ' |')

            # The header separator follows the first row actually emitted,
            # even when earlier raw rows were empty.
            if len(md_rows) == 1:
                md_rows.append('| ' + ' | '.join(['---'] * len(cells)) + ' |')

        return '\n'.join(md_rows)

    return env_pattern.sub(replace_table, latex_text)
87
+
88
def convert_math_environments(latex_text):
    """Convert LaTeX math environments to Markdown math syntax.

    Display math (``equation``, ``align``, ``eqnarray``, their starred
    variants, and ``\\[ ... \\]``) becomes ``$$ ... $$``. Inline math
    (``math`` environment and ``\\( ... \\)``) becomes ``$ ... $``.

    Args:
        latex_text (str): Text that may contain LaTeX math.

    Returns:
        str: The text with math delimiters rewritten; the math content
        itself is not modified.
    """
    result = latex_text

    # Display environments, plain or starred. The backreference forces the
    # closing tag to match the opening one, star included.
    result = re.sub(
        r'\\begin\{((?:equation|align|eqnarray)\*?)\}(.*?)\\end\{\1\}',
        r'$$\2$$',
        result,
        flags=re.DOTALL,
    )

    # Inline math environment.
    result = re.sub(
        r'\\begin\{math\}(.*?)\\end\{math\}', r'$\1$', result, flags=re.DOTALL
    )

    # Bracket/paren delimiters. Preceding backslashes are consumed in pairs
    # so that the "\\" of a row break followed by "[" or "(" (as in
    # "\\[2pt]") is not mistaken for a math delimiter.
    result = re.sub(r'(?<!\\)((?:\\\\)*)\\[\[\]]', r'\1$$', result)
    result = re.sub(r'(?<!\\)((?:\\\\)*)\\[()]', r'\1$', result)

    return result
104
+
105
def convert_formatting_commands(latex_text):
    """Convert LaTeX formatting commands to Markdown syntax.

    Handles bold, italic, emphasis, underline, section headings (plain and
    starred) and ``\\title``. A command argument may contain one level of
    nested braces (e.g. ``\\textbf{a \\textit{b}}`` or ``\\emph{$x_{1}$}``);
    commands with deeper nesting are left unchanged.

    Args:
        latex_text (str): Text that may contain LaTeX formatting commands.

    Returns:
        str: The text with the recognised commands replaced by Markdown.
    """
    # Brace-delimited argument: brace-free text, optionally mixed with
    # balanced {...} groups one level deep. Stopping at the first "}" would
    # cut nested commands in half.
    argument = r'\{((?:[^{}]|\{[^{}]*\})*)\}'

    # (command pattern, Markdown template). Outer commands are listed before
    # the ones they typically wrap so nested arguments are still intact when
    # the inner command is converted.
    rules = (
        # Bold: \textbf{text} -> **text**
        (r'\\textbf', r'**\1**'),
        (r'\\bf', r'**\1**'),
        # Italic: \textit{text} -> *text*
        (r'\\textit', r'*\1*'),
        (r'\\it', r'*\1*'),
        (r'\\emph', r'*\1*'),
        # Underline has no direct Markdown equivalent; use emphasis.
        (r'\\underline', r'_\1_'),
        # Section headings; the starred (unnumbered) forms map the same way.
        (r'\\section\*?', r'## \1'),
        (r'\\subsection\*?', r'### \1'),
        (r'\\subsubsection\*?', r'#### \1'),
        # Title becomes the top-level heading.
        (r'\\title', r'# \1'),
    )

    result = latex_text
    for command, template in rules:
        result = re.sub(command + argument, template, result)

    return result
130
+
131
def convert_latex_lists(latex_text):
    """Convert LaTeX ``itemize``/``enumerate`` lists to Markdown lists.

    ``itemize`` items become ``- item`` lines and ``enumerate`` items become
    ``1. item`` lines. Nested lists are converted innermost first and end up
    indented under the item that contains them.

    Args:
        latex_text (str): Text that may contain LaTeX list environments.

    Returns:
        str: The text with each list environment replaced by a Markdown
        list surrounded by newlines.
    """
    # Matches only lists whose body holds no further \begin{itemize|enumerate},
    # i.e. the innermost ones. A plain non-greedy match would pair an outer
    # \begin with an inner \end and leave dangling LaTeX behind.
    innermost_list = re.compile(
        r'\\begin\{(itemize|enumerate)\}'
        r'((?:(?!\\begin\{(?:itemize|enumerate)\}).)*?)'
        r'\\end\{\1\}',
        re.DOTALL,
    )
    # \item need not be followed by whitespace (e.g. "\item**bold**"), but
    # must not be the prefix of a longer command name.
    item_marker = re.compile(r'\\item(?![A-Za-z])')

    def format_item(marker, text):
        # Continuation lines (including an already-converted nested list)
        # are indented so they stay attached to this item.
        first, *rest = text.strip().split('\n')
        indent = ' ' * (len(marker) + 1)
        lines = [f'{marker} {first}']
        lines.extend(indent + line if line.strip() else line for line in rest)
        return '\n'.join(lines)

    def replace_list(match):
        ordered = match.group(1) == 'enumerate'
        # Anything before the first \item is not part of any item.
        items = item_marker.split(match.group(2))[1:]
        rendered = [
            format_item(f'{number}.' if ordered else '-', item)
            for number, item in enumerate(items, start=1)
        ]
        return '\n' + '\n'.join(rendered) + '\n'

    result = latex_text
    # Each pass removes at least one \begin, so the loop terminates.
    while True:
        result, replaced = innermost_list.subn(replace_list, result)
        if not replaced:
            return result
156
+
157
def cleanup_latex(latex_text):
    """Clean up any remaining LaTeX-specific syntax.

    Removes document-structure commands, unescapes LaTeX special characters
    and collapses runs of blank lines.

    Args:
        latex_text (str): Text that has already been through the other
            converters.

    Returns:
        str: The cleaned-up text. The character replacements are applied to
        the whole text, math spans included.
    """
    result = latex_text

    # Remove LaTeX document structure commands.
    result = re.sub(r'\\begin\{document\}|\\end\{document\}', '', result)
    result = re.sub(r'\\maketitle', '', result)
    # Preamble commands usually carry an optional [options] argument, e.g.
    # \documentclass[12pt]{article} or \usepackage[utf8]{inputenc}.
    result = re.sub(
        r'\\(?:documentclass|usepackage)(?:\[[^\]]*\])?\{[^}]*\}', '', result
    )

    # Convert special characters.
    latex_special_chars = {
        r'\&': '&',
        r'\%': '%',
        r'\$': '$',
        r'\#': '#',
        r'\_': '_',
        r'\{': '{',
        r'\}': '}',
        r'~': ' ',  # LaTeX non-breaking space
        r'\ldots': '...',
    }

    for latex_char, md_char in latex_special_chars.items():
        result = result.replace(latex_char, md_char)

    # Collapse three or more consecutive line breaks (possibly separated by
    # whitespace) into a single blank line.
    result = re.sub(r'\n\s*\n\s*\n', '\n\n', result)

    return result