Spaces: Running on Zero
restore to version 1
Browse files
- app.py +2 -18
- requirements.txt +0 -1
- setup.sh +1 -1
- src/core/parser_factory.py +0 -8
- src/parsers/got_ocr_integration.py +0 -41
- src/utils/__init__.py +0 -4
- src/utils/latex_converter.py +0 -194
app.py
CHANGED
@@ -50,24 +50,8 @@ try:
|
|
50 |
import transformers
|
51 |
print(f"Transformers version: {transformers.__version__}")
|
52 |
except ImportError:
|
53 |
-
print("WARNING: Transformers not installed. Installing from GitHub...")
|
54 |
-
subprocess.run([sys.executable, "-m", "pip", "install", "-q", "git+https://github.com/huggingface/transformers.git@main"], check=False)
|
55 |
-
|
56 |
-
# Check if latex2markdown module is installed (needed for LaTeX conversion)
|
57 |
-
try:
|
58 |
-
import latex2markdown
|
59 |
-
print("LaTeX2Markdown module found for advanced LaTeX conversion")
|
60 |
-
except ImportError:
|
61 |
-
print("WARNING: LaTeX2Markdown module not found. Installing...")
|
62 |
-
subprocess.run([sys.executable, "-m", "pip", "install", "-q", "latex2markdown"], check=False)
|
63 |
-
|
64 |
-
# Check if regex module is installed (needed for LaTeX conversion)
|
65 |
-
try:
|
66 |
-
import regex
|
67 |
-
print(f"Regex module found: {regex.__version__ if hasattr(regex, '__version__') else 'version unknown'}")
|
68 |
-
except ImportError:
|
69 |
-
print("WARNING: Regex module not found. Installing...")
|
70 |
-
subprocess.run([sys.executable, "-m", "pip", "install", "-q", "regex>=2023.0.0"], check=False)
|
71 |
|
72 |
# Check if numpy is installed with the correct version
|
73 |
try:
|
|
|
50 |
import transformers
|
51 |
print(f"Transformers version: {transformers.__version__}")
|
52 |
except ImportError:
|
53 |
+
print("WARNING: Transformers not installed. Installing transformers from GitHub...")
|
54 |
+
subprocess.run([sys.executable, "-m", "pip", "install", "-q", "git+https://github.com/huggingface/transformers.git@main", "accelerate", "verovio"], check=False)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
55 |
|
56 |
# Check if numpy is installed with the correct version
|
57 |
try:
|
requirements.txt
CHANGED
@@ -14,7 +14,6 @@ opencv-python # Match exact dependency from GOT-OCR
|
|
14 |
python-dotenv>=1.0.0
|
15 |
pydantic==2.7.1
|
16 |
latex2markdown>=0.1.0 # For LaTeX to Markdown conversion
|
17 |
-
regex>=2023.0.0 # For advanced regex pattern matching
|
18 |
|
19 |
# Gemini API client
|
20 |
google-genai>=0.1.0
|
|
|
14 |
python-dotenv>=1.0.0
|
15 |
pydantic==2.7.1
|
16 |
latex2markdown>=0.1.0 # For LaTeX to Markdown conversion
|
|
|
17 |
|
18 |
# Gemini API client
|
19 |
google-genai>=0.1.0
|
setup.sh
CHANGED
@@ -29,7 +29,7 @@ echo "NumPy installed successfully"
|
|
29 |
echo "Installing Python dependencies..."
|
30 |
pip install -q -U pillow opencv-python
|
31 |
pip install -q -U google-genai
|
32 |
-
pip install -q -U latex2markdown
|
33 |
echo "Python dependencies installed successfully"
|
34 |
|
35 |
# Install GOT-OCR transformers dependencies
|
|
|
29 |
echo "Installing Python dependencies..."
|
30 |
pip install -q -U pillow opencv-python
|
31 |
pip install -q -U google-genai
|
32 |
+
pip install -q -U latex2markdown
|
33 |
echo "Python dependencies installed successfully"
|
34 |
|
35 |
# Install GOT-OCR transformers dependencies
|
src/core/parser_factory.py
CHANGED
@@ -7,9 +7,6 @@ import time
|
|
7 |
from src.parsers.parser_interface import DocumentParser
|
8 |
from src.parsers.parser_registry import ParserRegistry
|
9 |
|
10 |
-
# Import the GOT-OCR integration module for post-processing
|
11 |
-
from src.parsers.got_ocr_integration import process_got_ocr_output
|
12 |
-
|
13 |
|
14 |
class ParserFactory:
|
15 |
"""Factory for creating parser instances."""
|
@@ -94,10 +91,5 @@ class ParserFactory:
|
|
94 |
# Check one more time after parsing completes
|
95 |
if check_cancellation():
|
96 |
return "Conversion cancelled."
|
97 |
-
|
98 |
-
# Post-process the result for GOT-OCR parser
|
99 |
-
if "GOT-OCR" in parser_name:
|
100 |
-
logging.info(f"Post-processing GOT-OCR output for {ocr_method_name}")
|
101 |
-
result = process_got_ocr_output(result, ocr_method_name, output_format)
|
102 |
|
103 |
return result
|
|
|
7 |
from src.parsers.parser_interface import DocumentParser
|
8 |
from src.parsers.parser_registry import ParserRegistry
|
9 |
|
|
|
|
|
|
|
10 |
|
11 |
class ParserFactory:
|
12 |
"""Factory for creating parser instances."""
|
|
|
91 |
# Check one more time after parsing completes
|
92 |
if check_cancellation():
|
93 |
return "Conversion cancelled."
|
|
|
|
|
|
|
|
|
|
|
94 |
|
95 |
return result
|
src/parsers/got_ocr_integration.py
DELETED
@@ -1,41 +0,0 @@
|
|
1 |
-
import logging
|
2 |
-
from typing import Optional, Dict, Any
|
3 |
-
import os
|
4 |
-
from pathlib import Path
|
5 |
-
|
6 |
-
# Import the LaTeX converter utility
|
7 |
-
from src.utils.latex_converter import convert_latex_to_markdown
|
8 |
-
|
9 |
-
# Configure logging
|
10 |
-
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
|
11 |
-
logger = logging.getLogger(__name__)
|
12 |
-
|
13 |
-
def process_got_ocr_output(output_text: str, ocr_method: str, output_format: str) -> str:
|
14 |
-
"""
|
15 |
-
Process the output from GOT-OCR parser and convert if needed.
|
16 |
-
|
17 |
-
Args:
|
18 |
-
output_text: The raw output text from the GOT-OCR parser
|
19 |
-
ocr_method: The OCR method used (Plain Text, Formatted Text)
|
20 |
-
output_format: The desired output format (Markdown, etc.)
|
21 |
-
|
22 |
-
Returns:
|
23 |
-
str: The processed text
|
24 |
-
"""
|
25 |
-
if not output_text:
|
26 |
-
return ""
|
27 |
-
|
28 |
-
# If not using formatted text or not requesting Markdown, return the original text
|
29 |
-
if ocr_method.lower() != "formatted text" or output_format.lower() != "markdown":
|
30 |
-
return output_text
|
31 |
-
|
32 |
-
# Process the formatted text (LaTeX) into enhanced Markdown
|
33 |
-
logger.info("Converting LaTeX output to enhanced Markdown format")
|
34 |
-
try:
|
35 |
-
markdown_text = convert_latex_to_markdown(output_text)
|
36 |
-
logger.info("LaTeX to Markdown conversion successful")
|
37 |
-
return markdown_text
|
38 |
-
except Exception as e:
|
39 |
-
logger.error(f"Error converting LaTeX to Markdown: {str(e)}")
|
40 |
-
# Return the original text if conversion fails
|
41 |
-
return output_text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
src/utils/__init__.py
DELETED
@@ -1,4 +0,0 @@
|
|
1 |
-
"""
|
2 |
-
Utilities package for Markit.
|
3 |
-
Contains shared utility functions and helper modules.
|
4 |
-
"""
|
|
|
|
|
|
|
|
|
|
src/utils/latex_converter.py
DELETED
@@ -1,194 +0,0 @@
|
|
1 |
-
import re
|
2 |
-
import logging
|
3 |
-
from typing import Dict, List, Tuple, Optional
|
4 |
-
import latex2markdown
|
5 |
-
|
6 |
-
# Configure logging
|
7 |
-
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
|
8 |
-
logger = logging.getLogger(__name__)
|
9 |
-
|
10 |
-
class LatexConverter:
|
11 |
-
"""Enhanced LaTeX to Markdown converter that handles complex LaTeX structures."""
|
12 |
-
|
13 |
-
@staticmethod
|
14 |
-
def convert(latex_text: str) -> str:
|
15 |
-
"""
|
16 |
-
Convert LaTeX text to Markdown, with special handling for tables and other structures.
|
17 |
-
|
18 |
-
Args:
|
19 |
-
latex_text: Raw LaTeX text from the GOT-OCR model
|
20 |
-
|
21 |
-
Returns:
|
22 |
-
str: Converted Markdown text
|
23 |
-
"""
|
24 |
-
if not latex_text or not isinstance(latex_text, str):
|
25 |
-
return ""
|
26 |
-
|
27 |
-
# Process the text in stages
|
28 |
-
processed_text = latex_text
|
29 |
-
|
30 |
-
# Stage 1: Pre-process tables before standard conversion
|
31 |
-
processed_text, tables_dict = LatexConverter._extract_tables(processed_text)
|
32 |
-
|
33 |
-
# Stage 2: Convert using latex2markdown library
|
34 |
-
try:
|
35 |
-
# Use the standard latex2markdown library as a base - FOLLOWING OFFICIAL DOCUMENTATION
|
36 |
-
l2m = latex2markdown.LaTeX2Markdown(processed_text)
|
37 |
-
processed_text = l2m.to_markdown()
|
38 |
-
except Exception as e:
|
39 |
-
logger.error(f"Error in standard latex2markdown conversion: {str(e)}")
|
40 |
-
# Continue with our custom processing even if the standard library fails
|
41 |
-
|
42 |
-
# Stage 3: Post-process to fix any remaining issues
|
43 |
-
processed_text = LatexConverter._postprocess_markdown(processed_text)
|
44 |
-
|
45 |
-
# Stage 4: Reinsert tables as markdown tables
|
46 |
-
processed_text = LatexConverter._reinsert_tables(processed_text, tables_dict)
|
47 |
-
|
48 |
-
return processed_text
|
49 |
-
|
50 |
-
@staticmethod
|
51 |
-
def _extract_tables(latex_text: str) -> tuple:
|
52 |
-
"""
|
53 |
-
Extract tables from LaTeX and replace with placeholders.
|
54 |
-
|
55 |
-
Args:
|
56 |
-
latex_text: Raw LaTeX text
|
57 |
-
|
58 |
-
Returns:
|
59 |
-
tuple: (processed text with placeholders, dict of tables)
|
60 |
-
"""
|
61 |
-
processed_text = latex_text
|
62 |
-
tables_dict = {}
|
63 |
-
|
64 |
-
# Find all tabular environments
|
65 |
-
table_pattern = r'\\begin{tabular}(.*?)\\end{tabular}'
|
66 |
-
tables = re.findall(table_pattern, processed_text, re.DOTALL)
|
67 |
-
|
68 |
-
for i, table_content in enumerate(tables):
|
69 |
-
placeholder = f"TABLE_PLACEHOLDER_{i}"
|
70 |
-
tables_dict[placeholder] = table_content
|
71 |
-
|
72 |
-
# Replace the table with a placeholder
|
73 |
-
processed_text = processed_text.replace(
|
74 |
-
f"\\begin{{tabular}}{table_content}\\end{{tabular}}",
|
75 |
-
placeholder
|
76 |
-
)
|
77 |
-
|
78 |
-
return processed_text, tables_dict
|
79 |
-
|
80 |
-
@staticmethod
|
81 |
-
def _reinsert_tables(markdown_text: str, tables_dict: dict) -> str:
|
82 |
-
"""
|
83 |
-
Convert LaTeX tables to Markdown tables and reinsert them.
|
84 |
-
|
85 |
-
Args:
|
86 |
-
markdown_text: Processed markdown text with placeholders
|
87 |
-
tables_dict: Dictionary of tables extracted from LaTeX
|
88 |
-
|
89 |
-
Returns:
|
90 |
-
str: Markdown text with tables converted and reinserted
|
91 |
-
"""
|
92 |
-
processed_text = markdown_text
|
93 |
-
|
94 |
-
for placeholder, table_content in tables_dict.items():
|
95 |
-
# Convert LaTeX table to Markdown table
|
96 |
-
markdown_table = LatexConverter._convert_table_to_markdown(table_content)
|
97 |
-
|
98 |
-
# Replace the placeholder with the Markdown table
|
99 |
-
processed_text = processed_text.replace(placeholder, markdown_table)
|
100 |
-
|
101 |
-
return processed_text
|
102 |
-
|
103 |
-
@staticmethod
|
104 |
-
def _convert_table_to_markdown(table_content: str) -> str:
|
105 |
-
"""
|
106 |
-
Convert a LaTeX table to Markdown format.
|
107 |
-
|
108 |
-
Args:
|
109 |
-
table_content: LaTeX table content
|
110 |
-
|
111 |
-
Returns:
|
112 |
-
str: Markdown table
|
113 |
-
"""
|
114 |
-
# Extract the column specification
|
115 |
-
col_spec_match = re.search(r'{([^}]*)}', table_content)
|
116 |
-
if not col_spec_match:
|
117 |
-
return f"[Table conversion failed]"
|
118 |
-
|
119 |
-
# Process the table content
|
120 |
-
rows_text = re.sub(r'{[^}]*}', '', table_content, count=1) # Remove the column spec
|
121 |
-
|
122 |
-
# Split into rows by \\ or \hline
|
123 |
-
rows = re.split(r'\\\\|\\hline', rows_text)
|
124 |
-
rows = [row.strip() for row in rows if row.strip()]
|
125 |
-
|
126 |
-
if not rows:
|
127 |
-
return "[Empty table]"
|
128 |
-
|
129 |
-
# Calculate number of columns based on the number of & in the first non-empty row plus 1
|
130 |
-
num_cols = 1 # Default
|
131 |
-
for row in rows:
|
132 |
-
if '&' in row:
|
133 |
-
num_cols = row.count('&') + 1
|
134 |
-
break
|
135 |
-
|
136 |
-
# Build markdown table
|
137 |
-
markdown_table = []
|
138 |
-
|
139 |
-
# Add header row
|
140 |
-
if rows:
|
141 |
-
first_row = rows[0]
|
142 |
-
cells = [cell.strip() for cell in first_row.split('&')]
|
143 |
-
markdown_table.append("| " + " | ".join(cells + [""] * (num_cols - len(cells))) + " |")
|
144 |
-
|
145 |
-
# Add separator row
|
146 |
-
markdown_table.append("| " + " | ".join(["---"] * num_cols) + " |")
|
147 |
-
|
148 |
-
# Add data rows
|
149 |
-
for row in rows[1:]:
|
150 |
-
cells = [cell.strip() for cell in row.split('&')]
|
151 |
-
markdown_table.append("| " + " | ".join(cells + [""] * (num_cols - len(cells))) + " |")
|
152 |
-
|
153 |
-
return "\n".join(markdown_table)
|
154 |
-
|
155 |
-
@staticmethod
|
156 |
-
def _postprocess_markdown(markdown_text: str) -> str:
|
157 |
-
"""
|
158 |
-
Post-process the converted Markdown to fix any remaining issues.
|
159 |
-
|
160 |
-
Args:
|
161 |
-
markdown_text: Converted Markdown text
|
162 |
-
|
163 |
-
Returns:
|
164 |
-
str: Post-processed Markdown text
|
165 |
-
"""
|
166 |
-
processed_text = markdown_text
|
167 |
-
|
168 |
-
# Fix math blocks
|
169 |
-
processed_text = re.sub(r'\\\[(.*?)\\\]', r'$$\1$$', processed_text, flags=re.DOTALL)
|
170 |
-
processed_text = re.sub(r'\\\((.*?)\\\)', r'$\1$', processed_text, flags=re.DOTALL)
|
171 |
-
|
172 |
-
# Fix formatting issues
|
173 |
-
processed_text = processed_text.replace('\\textbf{', '**')
|
174 |
-
processed_text = processed_text.replace('\\textit{', '*')
|
175 |
-
processed_text = processed_text.replace('}', '') # Remove closing braces
|
176 |
-
|
177 |
-
# Fix escape sequences
|
178 |
-
processed_text = processed_text.replace('\\%', '%')
|
179 |
-
processed_text = processed_text.replace('\\$', '$')
|
180 |
-
processed_text = processed_text.replace('\\&', '&')
|
181 |
-
|
182 |
-
return processed_text
|
183 |
-
|
184 |
-
def convert_latex_to_markdown(latex_text: str) -> str:
|
185 |
-
"""
|
186 |
-
Convenience function to convert LaTeX to Markdown.
|
187 |
-
|
188 |
-
Args:
|
189 |
-
latex_text: Raw LaTeX text from the GOT-OCR model
|
190 |
-
|
191 |
-
Returns:
|
192 |
-
str: Converted Markdown text
|
193 |
-
"""
|
194 |
-
return LatexConverter.convert(latex_text)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|