AnseMin committed on
Commit
23ad33e
·
1 Parent(s): 34d180e

restore to version 1

Browse files
app.py CHANGED
@@ -50,24 +50,8 @@ try:
50
  import transformers
51
  print(f"Transformers version: {transformers.__version__}")
52
  except ImportError:
53
- print("WARNING: Transformers not installed. Installing from GitHub...")
54
- subprocess.run([sys.executable, "-m", "pip", "install", "-q", "git+https://github.com/huggingface/transformers.git@main"], check=False)
55
-
56
- # Check if latex2markdown module is installed (needed for LaTeX conversion)
57
- try:
58
- import latex2markdown
59
- print("LaTeX2Markdown module found for advanced LaTeX conversion")
60
- except ImportError:
61
- print("WARNING: LaTeX2Markdown module not found. Installing...")
62
- subprocess.run([sys.executable, "-m", "pip", "install", "-q", "latex2markdown"], check=False)
63
-
64
- # Check if regex module is installed (needed for LaTeX conversion)
65
- try:
66
- import regex
67
- print(f"Regex module found: {regex.__version__ if hasattr(regex, '__version__') else 'version unknown'}")
68
- except ImportError:
69
- print("WARNING: Regex module not found. Installing...")
70
- subprocess.run([sys.executable, "-m", "pip", "install", "-q", "regex>=2023.0.0"], check=False)
71
 
72
  # Check if numpy is installed with the correct version
73
  try:
 
50
  import transformers
51
  print(f"Transformers version: {transformers.__version__}")
52
  except ImportError:
53
+ print("WARNING: Transformers not installed. Installing transformers from GitHub...")
54
+ subprocess.run([sys.executable, "-m", "pip", "install", "-q", "git+https://github.com/huggingface/transformers.git@main", "accelerate", "verovio"], check=False)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
55
 
56
  # Check if numpy is installed with the correct version
57
  try:
requirements.txt CHANGED
@@ -14,7 +14,6 @@ opencv-python # Match exact dependency from GOT-OCR
14
  python-dotenv>=1.0.0
15
  pydantic==2.7.1
16
  latex2markdown>=0.1.0 # For LaTeX to Markdown conversion
17
- regex>=2023.0.0 # For advanced regex pattern matching
18
 
19
  # Gemini API client
20
  google-genai>=0.1.0
 
14
  python-dotenv>=1.0.0
15
  pydantic==2.7.1
16
  latex2markdown>=0.1.0 # For LaTeX to Markdown conversion
 
17
 
18
  # Gemini API client
19
  google-genai>=0.1.0
setup.sh CHANGED
@@ -29,7 +29,7 @@ echo "NumPy installed successfully"
29
  echo "Installing Python dependencies..."
30
  pip install -q -U pillow opencv-python
31
  pip install -q -U google-genai
32
- pip install -q -U latex2markdown regex>=2023.0.0
33
  echo "Python dependencies installed successfully"
34
 
35
  # Install GOT-OCR transformers dependencies
 
29
  echo "Installing Python dependencies..."
30
  pip install -q -U pillow opencv-python
31
  pip install -q -U google-genai
32
+ pip install -q -U latex2markdown
33
  echo "Python dependencies installed successfully"
34
 
35
  # Install GOT-OCR transformers dependencies
src/core/parser_factory.py CHANGED
@@ -7,9 +7,6 @@ import time
7
  from src.parsers.parser_interface import DocumentParser
8
  from src.parsers.parser_registry import ParserRegistry
9
 
10
- # Import the GOT-OCR integration module for post-processing
11
- from src.parsers.got_ocr_integration import process_got_ocr_output
12
-
13
 
14
  class ParserFactory:
15
  """Factory for creating parser instances."""
@@ -94,10 +91,5 @@ class ParserFactory:
94
  # Check one more time after parsing completes
95
  if check_cancellation():
96
  return "Conversion cancelled."
97
-
98
- # Post-process the result for GOT-OCR parser
99
- if "GOT-OCR" in parser_name:
100
- logging.info(f"Post-processing GOT-OCR output for {ocr_method_name}")
101
- result = process_got_ocr_output(result, ocr_method_name, output_format)
102
 
103
  return result
 
7
  from src.parsers.parser_interface import DocumentParser
8
  from src.parsers.parser_registry import ParserRegistry
9
 
 
 
 
10
 
11
  class ParserFactory:
12
  """Factory for creating parser instances."""
 
91
  # Check one more time after parsing completes
92
  if check_cancellation():
93
  return "Conversion cancelled."
 
 
 
 
 
94
 
95
  return result
src/parsers/got_ocr_integration.py DELETED
@@ -1,41 +0,0 @@
1
- import logging
2
- from typing import Optional, Dict, Any
3
- import os
4
- from pathlib import Path
5
-
6
- # Import the LaTeX converter utility
7
- from src.utils.latex_converter import convert_latex_to_markdown
8
-
9
- # Configure logging
10
- logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
11
- logger = logging.getLogger(__name__)
12
-
13
- def process_got_ocr_output(output_text: str, ocr_method: str, output_format: str) -> str:
14
- """
15
- Process the output from GOT-OCR parser and convert if needed.
16
-
17
- Args:
18
- output_text: The raw output text from the GOT-OCR parser
19
- ocr_method: The OCR method used (Plain Text, Formatted Text)
20
- output_format: The desired output format (Markdown, etc.)
21
-
22
- Returns:
23
- str: The processed text
24
- """
25
- if not output_text:
26
- return ""
27
-
28
- # If not using formatted text or not requesting Markdown, return the original text
29
- if ocr_method.lower() != "formatted text" or output_format.lower() != "markdown":
30
- return output_text
31
-
32
- # Process the formatted text (LaTeX) into enhanced Markdown
33
- logger.info("Converting LaTeX output to enhanced Markdown format")
34
- try:
35
- markdown_text = convert_latex_to_markdown(output_text)
36
- logger.info("LaTeX to Markdown conversion successful")
37
- return markdown_text
38
- except Exception as e:
39
- logger.error(f"Error converting LaTeX to Markdown: {str(e)}")
40
- # Return the original text if conversion fails
41
- return output_text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/utils/__init__.py DELETED
@@ -1,4 +0,0 @@
1
- """
2
- Utilities package for Markit.
3
- Contains shared utility functions and helper modules.
4
- """
 
 
 
 
 
src/utils/latex_converter.py DELETED
@@ -1,194 +0,0 @@
1
- import re
2
- import logging
3
- from typing import Dict, List, Tuple, Optional
4
- import latex2markdown
5
-
6
- # Configure logging
7
- logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
8
- logger = logging.getLogger(__name__)
9
-
10
- class LatexConverter:
11
- """Enhanced LaTeX to Markdown converter that handles complex LaTeX structures."""
12
-
13
- @staticmethod
14
- def convert(latex_text: str) -> str:
15
- """
16
- Convert LaTeX text to Markdown, with special handling for tables and other structures.
17
-
18
- Args:
19
- latex_text: Raw LaTeX text from the GOT-OCR model
20
-
21
- Returns:
22
- str: Converted Markdown text
23
- """
24
- if not latex_text or not isinstance(latex_text, str):
25
- return ""
26
-
27
- # Process the text in stages
28
- processed_text = latex_text
29
-
30
- # Stage 1: Pre-process tables before standard conversion
31
- processed_text, tables_dict = LatexConverter._extract_tables(processed_text)
32
-
33
- # Stage 2: Convert using latex2markdown library
34
- try:
35
- # Use the standard latex2markdown library as a base - FOLLOWING OFFICIAL DOCUMENTATION
36
- l2m = latex2markdown.LaTeX2Markdown(processed_text)
37
- processed_text = l2m.to_markdown()
38
- except Exception as e:
39
- logger.error(f"Error in standard latex2markdown conversion: {str(e)}")
40
- # Continue with our custom processing even if the standard library fails
41
-
42
- # Stage 3: Post-process to fix any remaining issues
43
- processed_text = LatexConverter._postprocess_markdown(processed_text)
44
-
45
- # Stage 4: Reinsert tables as markdown tables
46
- processed_text = LatexConverter._reinsert_tables(processed_text, tables_dict)
47
-
48
- return processed_text
49
-
50
- @staticmethod
51
- def _extract_tables(latex_text: str) -> tuple:
52
- """
53
- Extract tables from LaTeX and replace with placeholders.
54
-
55
- Args:
56
- latex_text: Raw LaTeX text
57
-
58
- Returns:
59
- tuple: (processed text with placeholders, dict of tables)
60
- """
61
- processed_text = latex_text
62
- tables_dict = {}
63
-
64
- # Find all tabular environments
65
- table_pattern = r'\\begin{tabular}(.*?)\\end{tabular}'
66
- tables = re.findall(table_pattern, processed_text, re.DOTALL)
67
-
68
- for i, table_content in enumerate(tables):
69
- placeholder = f"TABLE_PLACEHOLDER_{i}"
70
- tables_dict[placeholder] = table_content
71
-
72
- # Replace the table with a placeholder
73
- processed_text = processed_text.replace(
74
- f"\\begin{{tabular}}{table_content}\\end{{tabular}}",
75
- placeholder
76
- )
77
-
78
- return processed_text, tables_dict
79
-
80
- @staticmethod
81
- def _reinsert_tables(markdown_text: str, tables_dict: dict) -> str:
82
- """
83
- Convert LaTeX tables to Markdown tables and reinsert them.
84
-
85
- Args:
86
- markdown_text: Processed markdown text with placeholders
87
- tables_dict: Dictionary of tables extracted from LaTeX
88
-
89
- Returns:
90
- str: Markdown text with tables converted and reinserted
91
- """
92
- processed_text = markdown_text
93
-
94
- for placeholder, table_content in tables_dict.items():
95
- # Convert LaTeX table to Markdown table
96
- markdown_table = LatexConverter._convert_table_to_markdown(table_content)
97
-
98
- # Replace the placeholder with the Markdown table
99
- processed_text = processed_text.replace(placeholder, markdown_table)
100
-
101
- return processed_text
102
-
103
- @staticmethod
104
- def _convert_table_to_markdown(table_content: str) -> str:
105
- """
106
- Convert a LaTeX table to Markdown format.
107
-
108
- Args:
109
- table_content: LaTeX table content
110
-
111
- Returns:
112
- str: Markdown table
113
- """
114
- # Extract the column specification
115
- col_spec_match = re.search(r'{([^}]*)}', table_content)
116
- if not col_spec_match:
117
- return f"[Table conversion failed]"
118
-
119
- # Process the table content
120
- rows_text = re.sub(r'{[^}]*}', '', table_content, count=1) # Remove the column spec
121
-
122
- # Split into rows by \\ or \hline
123
- rows = re.split(r'\\\\|\\hline', rows_text)
124
- rows = [row.strip() for row in rows if row.strip()]
125
-
126
- if not rows:
127
- return "[Empty table]"
128
-
129
- # Calculate number of columns based on the number of & in the first non-empty row plus 1
130
- num_cols = 1 # Default
131
- for row in rows:
132
- if '&' in row:
133
- num_cols = row.count('&') + 1
134
- break
135
-
136
- # Build markdown table
137
- markdown_table = []
138
-
139
- # Add header row
140
- if rows:
141
- first_row = rows[0]
142
- cells = [cell.strip() for cell in first_row.split('&')]
143
- markdown_table.append("| " + " | ".join(cells + [""] * (num_cols - len(cells))) + " |")
144
-
145
- # Add separator row
146
- markdown_table.append("| " + " | ".join(["---"] * num_cols) + " |")
147
-
148
- # Add data rows
149
- for row in rows[1:]:
150
- cells = [cell.strip() for cell in row.split('&')]
151
- markdown_table.append("| " + " | ".join(cells + [""] * (num_cols - len(cells))) + " |")
152
-
153
- return "\n".join(markdown_table)
154
-
155
- @staticmethod
156
- def _postprocess_markdown(markdown_text: str) -> str:
157
- """
158
- Post-process the converted Markdown to fix any remaining issues.
159
-
160
- Args:
161
- markdown_text: Converted Markdown text
162
-
163
- Returns:
164
- str: Post-processed Markdown text
165
- """
166
- processed_text = markdown_text
167
-
168
- # Fix math blocks
169
- processed_text = re.sub(r'\\\[(.*?)\\\]', r'$$\1$$', processed_text, flags=re.DOTALL)
170
- processed_text = re.sub(r'\\\((.*?)\\\)', r'$\1$', processed_text, flags=re.DOTALL)
171
-
172
- # Fix formatting issues
173
- processed_text = processed_text.replace('\\textbf{', '**')
174
- processed_text = processed_text.replace('\\textit{', '*')
175
- processed_text = processed_text.replace('}', '') # Remove closing braces
176
-
177
- # Fix escape sequences
178
- processed_text = processed_text.replace('\\%', '%')
179
- processed_text = processed_text.replace('\\$', '$')
180
- processed_text = processed_text.replace('\\&', '&')
181
-
182
- return processed_text
183
-
184
- def convert_latex_to_markdown(latex_text: str) -> str:
185
- """
186
- Convenience function to convert LaTeX to Markdown.
187
-
188
- Args:
189
- latex_text: Raw LaTeX text from the GOT-OCR model
190
-
191
- Returns:
192
- str: Converted Markdown text
193
- """
194
- return LatexConverter.convert(latex_text)