AnseMin committed on
Commit
33f1b65
·
1 Parent(s): 5910e0d

Latex2Markdown display changes --attempt1

Browse files
app.py CHANGED
@@ -50,8 +50,24 @@ try:
50
  import transformers
51
  print(f"Transformers version: {transformers.__version__}")
52
  except ImportError:
53
- print("WARNING: Transformers not installed. Installing transformers from GitHub...")
54
- subprocess.run([sys.executable, "-m", "pip", "install", "-q", "git+https://github.com/huggingface/transformers.git@main", "accelerate", "verovio"], check=False)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
55
 
56
  # Check if numpy is installed with the correct version
57
  try:
 
50
  import transformers
51
  print(f"Transformers version: {transformers.__version__}")
52
  except ImportError:
53
+ print("WARNING: Transformers not installed. Installing from GitHub...")
54
+ subprocess.run([sys.executable, "-m", "pip", "install", "-q", "git+https://github.com/huggingface/transformers.git@main"], check=False)
55
+
56
+ # Check if latex2markdown module is installed (needed for LaTeX conversion)
57
+ try:
58
+ import latex2markdown
59
+ print("LaTeX2Markdown module found for advanced LaTeX conversion")
60
+ except ImportError:
61
+ print("WARNING: LaTeX2Markdown module not found. Installing...")
62
+ subprocess.run([sys.executable, "-m", "pip", "install", "-q", "latex2markdown"], check=False)
63
+
64
+ # Check if regex module is installed (needed for LaTeX conversion)
65
+ try:
66
+ import regex
67
+ print(f"Regex module found: {regex.__version__ if hasattr(regex, '__version__') else 'version unknown'}")
68
+ except ImportError:
69
+ print("WARNING: Regex module not found. Installing...")
70
+ subprocess.run([sys.executable, "-m", "pip", "install", "-q", "regex>=2023.0.0"], check=False)
71
 
72
  # Check if numpy is installed with the correct version
73
  try:
requirements.txt CHANGED
@@ -14,6 +14,7 @@ opencv-python # Match exact dependency from GOT-OCR
14
  python-dotenv>=1.0.0
15
  pydantic==2.7.1
16
  latex2markdown>=0.1.0 # For LaTeX to Markdown conversion
 
17
 
18
  # Gemini API client
19
  google-genai>=0.1.0
 
14
  python-dotenv>=1.0.0
15
  pydantic==2.7.1
16
  latex2markdown>=0.1.0 # For LaTeX to Markdown conversion
17
+ regex>=2023.0.0 # For advanced regex pattern matching
18
 
19
  # Gemini API client
20
  google-genai>=0.1.0
setup.sh CHANGED
@@ -29,7 +29,7 @@ echo "NumPy installed successfully"
29
  echo "Installing Python dependencies..."
30
  pip install -q -U pillow opencv-python
31
  pip install -q -U google-genai
32
- pip install -q -U latex2markdown
33
  echo "Python dependencies installed successfully"
34
 
35
  # Install GOT-OCR transformers dependencies
 
29
  echo "Installing Python dependencies..."
30
  pip install -q -U pillow opencv-python
31
  pip install -q -U google-genai
32
+ pip install -q -U latex2markdown "regex>=2023.0.0"  # quoted: an unquoted '>' is a shell redirect, not a version constraint
33
  echo "Python dependencies installed successfully"
34
 
35
  # Install GOT-OCR transformers dependencies
src/core/parser_factory.py CHANGED
@@ -7,6 +7,9 @@ import time
7
  from src.parsers.parser_interface import DocumentParser
8
  from src.parsers.parser_registry import ParserRegistry
9
 
 
 
 
10
 
11
  class ParserFactory:
12
  """Factory for creating parser instances."""
@@ -91,5 +94,10 @@ class ParserFactory:
91
  # Check one more time after parsing completes
92
  if check_cancellation():
93
  return "Conversion cancelled."
 
 
 
 
 
94
 
95
  return result
 
7
  from src.parsers.parser_interface import DocumentParser
8
  from src.parsers.parser_registry import ParserRegistry
9
 
10
+ # Import the GOT-OCR integration module for post-processing
11
+ from src.parsers.got_ocr_integration import process_got_ocr_output
12
+
13
 
14
  class ParserFactory:
15
  """Factory for creating parser instances."""
 
94
  # Check one more time after parsing completes
95
  if check_cancellation():
96
  return "Conversion cancelled."
97
+
98
+ # Post-process the result for GOT-OCR parser
99
+ if "GOT-OCR" in parser_name:
100
+ logging.info(f"Post-processing GOT-OCR output for {ocr_method_name}")
101
+ result = process_got_ocr_output(result, ocr_method_name, output_format)
102
 
103
  return result
src/parsers/got_ocr_integration.py ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ from typing import Optional, Dict, Any
3
+ import os
4
+ from pathlib import Path
5
+
6
+ # Import the LaTeX converter utility
7
+ from src.utils.latex_converter import convert_latex_to_markdown
8
+
9
+ # Configure logging
10
+ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
11
+ logger = logging.getLogger(__name__)
12
+
13
def process_got_ocr_output(output_text: str, ocr_method: str, output_format: str) -> str:
    """
    Post-process GOT-OCR output, converting LaTeX to Markdown when applicable.

    Conversion only happens when the OCR method is "Formatted Text" and the
    requested output format is "Markdown" (both compared case-insensitively,
    ignoring surrounding whitespace). In every other case the text is
    returned unchanged.

    Args:
        output_text: The raw output text from the GOT-OCR parser.
        ocr_method: The OCR method used (e.g. "Plain Text", "Formatted Text").
            ``None`` is treated as "no conversion requested".
        output_format: The desired output format (e.g. "Markdown").
            ``None`` is treated as "no conversion requested".

    Returns:
        str: The converted Markdown, the original text when no conversion
        applies or the conversion fails, or "" when ``output_text`` is empty.
    """
    if not output_text:
        return ""

    # Normalise defensively: a missing method/format must not raise
    # AttributeError, it simply means there is nothing to convert.
    method = str(ocr_method or "").strip().lower()
    target = str(output_format or "").strip().lower()
    if method != "formatted text" or target != "markdown":
        return output_text

    logger.info("Converting LaTeX output to enhanced Markdown format")
    try:
        markdown_text = convert_latex_to_markdown(output_text)
    except Exception as exc:
        # Deliberate best-effort boundary: a converter failure must not lose
        # the OCR result, so log (with traceback) and fall back to raw text.
        logger.exception("Error converting LaTeX to Markdown: %s", exc)
        return output_text

    logger.info("LaTeX to Markdown conversion successful")
    return markdown_text
src/parsers/got_ocr_parser.py CHANGED
@@ -227,9 +227,10 @@ class GotOcrParser(DocumentParser):
227
  skip_special_tokens=True,
228
  )
229
 
230
- # Convert to Markdown if it's formatted
231
- l2m = latex2markdown.LaTeX2Markdown(result)
232
- result = l2m.to_markdown()
 
233
  else:
234
  # Plain text mode
235
  inputs = processor([image], return_tensors="pt")
@@ -318,9 +319,9 @@ class GotOcrParser(DocumentParser):
318
  skip_special_tokens=True,
319
  )
320
 
321
- # Convert to Markdown if it's formatted
322
- l2m = latex2markdown.LaTeX2Markdown(result)
323
- result = l2m.to_markdown()
324
  else:
325
  # Plain text mode
326
  inputs = processor([image], return_tensors="pt")
 
227
  skip_special_tokens=True,
228
  )
229
 
230
+ # Return raw LaTeX output - let post-processing handle conversion
231
+ # This allows for more advanced conversion in the integration module
232
+ logger.info("Returning raw LaTeX output for external processing")
233
+
234
  else:
235
  # Plain text mode
236
  inputs = processor([image], return_tensors="pt")
 
319
  skip_special_tokens=True,
320
  )
321
 
322
+ # Return raw LaTeX output - let post-processing handle conversion
323
+ # This allows for more advanced conversion in the integration module
324
+ logger.info("Returning raw LaTeX output for external processing")
325
  else:
326
  # Plain text mode
327
  inputs = processor([image], return_tensors="pt")
src/utils/__init__.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ """
2
+ Utilities package for Markit.
3
+ Contains shared utility functions and helper modules.
4
+ """
src/utils/latex_converter.py ADDED
@@ -0,0 +1,174 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import logging
3
+ from typing import Dict, List, Tuple, Optional
4
+ import latex2markdown
5
+
6
+ # Configure logging
7
+ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
8
+ logger = logging.getLogger(__name__)
9
+
10
class LatexConverter:
    """Enhanced LaTeX to Markdown converter that handles complex LaTeX structures.

    Conversion runs in three stages (see ``convert``): tabular environments
    are normalised, the ``latex2markdown`` library is applied on a
    best-effort basis, and the result is post-processed (tables, math
    delimiters, bold/italic, escape sequences).
    """

    # \begin{tabular}[pos]{colspec} body \end{tabular}; the column spec may
    # contain one level of nested braces (e.g. ``p{3cm}``). Group 1 = body.
    _TABULAR_RE = re.compile(
        r'\\begin\{tabular\}\s*(?:\[[^\]]*\]\s*)?'
        r'\{(?:[^{}]|\{[^{}]*\})*\}'
        r'(.*?)\\end\{tabular\}',
        re.DOTALL,
    )
    # Rows are delimited by ``\\`` or ``\hline``.
    _ROW_SPLIT_RE = re.compile(r'\\\\|\\hline')
    # Cells are delimited by ``&`` but not by the escaped literal ``\&``.
    _CELL_SPLIT_RE = re.compile(r'(?<!\\)&')
    # A pipe that is not escaped (``\|`` is also the LaTeX norm delimiter).
    _UNESCAPED_PIPE_RE = re.compile(r'(?<!\\)\|')
    # A Markdown table delimiter row such as ``| --- | :---: |``.
    _SEPARATOR_ROW_RE = re.compile(
        r'^\s*\|?\s*:?-{3,}:?\s*(?:\|\s*:?-{3,}:?\s*)*\|?\s*$'
    )

    @staticmethod
    def convert(latex_text: str) -> str:
        """
        Convert LaTeX text to Markdown, with special handling for tables.

        Args:
            latex_text: Raw LaTeX text from the GOT-OCR model.

        Returns:
            str: Converted Markdown text, or "" when the input is empty or
            not a string.
        """
        if not latex_text or not isinstance(latex_text, str):
            return ""

        # Stage 1: normalise tables before the standard conversion.
        processed_text = LatexConverter._preprocess_tables(latex_text)

        # Stage 2: run the latex2markdown library. This is best-effort: if it
        # fails, the custom post-processing below still runs on the
        # pre-processed text.
        try:
            l2m = latex2markdown.LaTeX2Markdown(processed_text)
            processed_text = l2m.to_markdown()
        except Exception as e:
            logger.error("Error in standard latex2markdown conversion: %s", e)

        # Stage 3: fix whatever the previous stages left behind.
        return LatexConverter._postprocess_markdown(processed_text)

    @staticmethod
    def _parse_tabular_rows(body: str) -> List[List[str]]:
        """Split a tabular body into rows of whitespace-normalised cell strings."""
        rows = []
        for raw_row in LatexConverter._ROW_SPLIT_RE.split(body):
            raw_row = raw_row.strip()
            if not raw_row:
                continue
            rows.append([
                ' '.join(cell.split())
                for cell in LatexConverter._CELL_SPLIT_RE.split(raw_row)
            ])
        return rows

    @staticmethod
    def _clean_tabular(match: re.Match) -> str:
        """Rebuild one matched tabular environment in a uniform layout."""
        rows = LatexConverter._parse_tabular_rows(match.group(1))
        num_cols = max((len(row) for row in rows), default=1)
        lines = ["\\begin{tabular}{" + '|'.join(['c'] * num_cols) + "}"]
        lines.extend(' & '.join(row) + " \\\\" for row in rows)
        lines.append("\\end{tabular}")
        return '\n'.join(lines)

    @staticmethod
    def _tabular_to_pipe_table(match: re.Match) -> str:
        """Render one matched tabular environment as a Markdown pipe table."""
        rows = LatexConverter._parse_tabular_rows(match.group(1))
        if not rows:
            # Nothing to render; leave the (empty) environment untouched.
            return match.group(0)

        width = max(len(row) for row in rows)

        def render(cells: List[str]) -> str:
            # Pad ragged rows and escape literal pipes so they do not split cells.
            padded = cells + [''] * (width - len(cells))
            return '| ' + ' | '.join(c.replace('|', '\\|') for c in padded) + ' |'

        lines = [render(rows[0]), '| ' + ' | '.join(['---'] * width) + ' |']
        lines.extend(render(row) for row in rows[1:])
        # Surround with newlines so a table embedded mid-line still starts
        # on its own line.
        return '\n' + '\n'.join(lines) + '\n'

    @staticmethod
    def _repair_pipe_tables(text: str) -> str:
        """
        Normalise runs of pipe-delimited lines into well-formed pipe tables.

        Only runs of two or more consecutive lines containing an unescaped
        ``|`` are treated as tables, so an isolated line such as ``|x| > 0``
        is left alone. Each row gets leading/trailing pipes, and a delimiter
        row is inserted after the first row when one is not already present.
        """
        has_pipe = LatexConverter._UNESCAPED_PIPE_RE.search
        lines = text.split('\n')
        repaired = []
        i = 0
        while i < len(lines):
            if not has_pipe(lines[i]):
                repaired.append(lines[i])
                i += 1
                continue

            # Collect the maximal run of consecutive pipe-containing lines.
            j = i
            while j < len(lines) and has_pipe(lines[j]):
                j += 1
            run = lines[i:j]
            i = j

            if len(run) < 2:
                repaired.extend(run)
                continue

            rows = []
            for line in run:
                row = line.strip()
                if not row.startswith('|'):
                    row = '| ' + row
                if not row.endswith('|') or row.endswith('\\|'):
                    row = row + ' |'
                rows.append(row)

            if not LatexConverter._SEPARATOR_ROW_RE.match(rows[1]):
                pipes = len(LatexConverter._UNESCAPED_PIPE_RE.findall(rows[0]))
                col_count = max(pipes - 1, 1)
                rows.insert(1, '| ' + ' | '.join(['---'] * col_count) + ' |')

            repaired.extend(rows)

        return '\n'.join(repaired)

    @staticmethod
    def _preprocess_tables(latex_text: str) -> str:
        """
        Pre-process LaTeX tables to ensure they convert correctly.

        Every ``tabular`` environment that starts with a column specification
        is rewritten with a plain ``c|c|...`` spec, one row per line, and
        ``\\hline`` rules removed. Other text is returned unchanged.

        Args:
            latex_text: Raw LaTeX text.

        Returns:
            str: Pre-processed LaTeX text with table modifications.
        """
        # A replacement *function* is used so backslashes in the rebuilt
        # table are inserted literally rather than parsed as escapes.
        return LatexConverter._TABULAR_RE.sub(
            LatexConverter._clean_tabular, latex_text
        )

    @staticmethod
    def _postprocess_markdown(markdown_text: str) -> str:
        """
        Post-process the converted Markdown to fix any remaining issues.

        Steps, in order: remaining ``tabular`` environments become pipe
        tables; malformed pipe tables are repaired; ``\\[..\\]`` and
        ``\\(..\\)`` become ``$$..$$`` and ``$..$``; ``\\textbf{..}`` and
        ``\\textit{..}`` become ``**..**`` and ``*..*``; ``\\%``, ``\\$`` and
        ``\\&`` are unescaped.

        Args:
            markdown_text: Converted Markdown text.

        Returns:
            str: Post-processed Markdown text.
        """
        processed_text = markdown_text

        # Tables: convert any tabular environment that survived conversion,
        # then tidy up pipe tables.
        processed_text = LatexConverter._TABULAR_RE.sub(
            LatexConverter._tabular_to_pipe_table, processed_text
        )
        processed_text = LatexConverter._repair_pipe_tables(processed_text)

        # Fix math blocks
        processed_text = re.sub(r'\\\[(.*?)\\\]', r'$$\1$$', processed_text, flags=re.DOTALL)
        processed_text = re.sub(r'\\\((.*?)\\\)', r'$\1$', processed_text, flags=re.DOTALL)

        # Fix formatting: rewrite each command together with its own closing
        # brace, leaving unrelated braces (e.g. in \frac{a}{b}) intact.
        processed_text = re.sub(r'\\textbf\{([^{}]*)\}', r'**\1**', processed_text)
        processed_text = re.sub(r'\\textit\{([^{}]*)\}', r'*\1*', processed_text)

        # Fix escape sequences
        processed_text = processed_text.replace('\\%', '%')
        processed_text = processed_text.replace('\\$', '$')
        processed_text = processed_text.replace('\\&', '&')

        return processed_text
163
+
164
def convert_latex_to_markdown(latex_text: str) -> str:
    """
    Module-level shortcut that delegates to ``LatexConverter.convert``.

    Args:
        latex_text: Raw LaTeX text from the GOT-OCR model

    Returns:
        str: Converted Markdown text
    """
    converted = LatexConverter.convert(latex_text)
    return converted