Spaces:
Runtime error
Runtime error
Approach #2 -- converting LaTeX output from GOT-OCR to Markdown
Browse files
- app.py +2 -1
- requirements.txt +0 -1
- setup.sh +1 -1
- src/core/converter.py +36 -0
- src/core/latex_to_markdown_converter.py +67 -0
- src/parsers/gemini_flash_parser.py +2 -1
- src/parsers/got_ocr_parser.py +2 -2
app.py
CHANGED
@@ -77,9 +77,10 @@ gemini_api_key = os.getenv("GOOGLE_API_KEY")
|
|
77 |
|
78 |
# Check if API key is available and print a message if not
|
79 |
if not gemini_api_key:
|
80 |
-
print("Warning: GOOGLE_API_KEY environment variable not found. Gemini Flash parser may not work.")
|
81 |
else:
|
82 |
print(f"Found Gemini API key: {gemini_api_key[:5]}...{gemini_api_key[-5:] if len(gemini_api_key) > 10 else ''}")
|
|
|
83 |
|
84 |
# Add the current directory to the Python path
|
85 |
sys.path.append(current_dir)
|
|
|
77 |
|
78 |
# Check if API key is available and print a message if not
|
79 |
if not gemini_api_key:
|
80 |
+
print("Warning: GOOGLE_API_KEY environment variable not found. Gemini Flash parser and LaTeX to Markdown conversion may not work.")
|
81 |
else:
|
82 |
print(f"Found Gemini API key: {gemini_api_key[:5]}...{gemini_api_key[-5:] if len(gemini_api_key) > 10 else ''}")
|
83 |
+
print("Gemini API will be used for LaTeX to Markdown conversion when using GOT-OCR with Formatted Text mode")
|
84 |
|
85 |
# Add the current directory to the Python path
|
86 |
sys.path.append(current_dir)
|
requirements.txt
CHANGED
@@ -13,7 +13,6 @@ opencv-python # Match exact dependency from GOT-OCR
|
|
13 |
# Utility dependencies
|
14 |
python-dotenv>=1.0.0
|
15 |
pydantic==2.7.1
|
16 |
-
latex2markdown>=0.1.0 # For LaTeX to Markdown conversion
|
17 |
|
18 |
# Gemini API client
|
19 |
google-genai>=0.1.0
|
|
|
13 |
# Utility dependencies
|
14 |
python-dotenv>=1.0.0
|
15 |
pydantic==2.7.1
|
|
|
16 |
|
17 |
# Gemini API client
|
18 |
google-genai>=0.1.0
|
setup.sh
CHANGED
@@ -29,7 +29,7 @@ echo "NumPy installed successfully"
|
|
29 |
echo "Installing Python dependencies..."
|
30 |
pip install -q -U pillow opencv-python
|
31 |
pip install -q -U google-genai
|
32 |
-
pip install -q -U latex2markdown
|
33 |
echo "Python dependencies installed successfully"
|
34 |
|
35 |
# Install GOT-OCR transformers dependencies
|
|
|
29 |
echo "Installing Python dependencies..."
|
30 |
pip install -q -U pillow opencv-python
|
31 |
pip install -q -U google-genai
|
32 |
+
# pip install -q -U latex2markdown - removed, now using Gemini API for LaTeX conversion
|
33 |
echo "Python dependencies installed successfully"
|
34 |
|
35 |
# Install GOT-OCR transformers dependencies
|
src/core/converter.py
CHANGED
@@ -10,6 +10,14 @@ from src.core.parser_factory import ParserFactory
|
|
10 |
# Import all parsers to ensure they're registered
|
11 |
from src import parsers
|
12 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
13 |
# Reference to the cancellation flag from ui.py
|
14 |
# This will be set by the UI when the cancel button is clicked
|
15 |
conversion_cancelled = None # Will be a threading.Event object
|
@@ -133,6 +141,34 @@ def convert_file(file_path, parser_name, ocr_method_name, output_format):
|
|
133 |
safe_delete_file(temp_input)
|
134 |
return "Conversion cancelled.", None
|
135 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
136 |
except Exception as e:
|
137 |
safe_delete_file(temp_input)
|
138 |
return f"Error: {e}", None
|
|
|
10 |
# Import all parsers to ensure they're registered
|
11 |
from src import parsers
|
12 |
|
13 |
+
# Import the LaTeX to Markdown converter
|
14 |
+
try:
|
15 |
+
from src.core.latex_to_markdown_converter import convert_latex_to_markdown
|
16 |
+
HAS_GEMINI_CONVERTER = True
|
17 |
+
except ImportError:
|
18 |
+
HAS_GEMINI_CONVERTER = False
|
19 |
+
logging.warning("LaTeX to Markdown converter not available. Raw LaTeX will be returned for formatted text.")
|
20 |
+
|
21 |
# Reference to the cancellation flag from ui.py
|
22 |
# This will be set by the UI when the cancel button is clicked
|
23 |
conversion_cancelled = None # Will be a threading.Event object
|
|
|
141 |
safe_delete_file(temp_input)
|
142 |
return "Conversion cancelled.", None
|
143 |
|
144 |
+
# Process LaTeX content for GOT-OCR formatted text
|
145 |
+
if parser_name == "GOT-OCR (jpg,png only)" and ocr_method_name == "Formatted Text" and HAS_GEMINI_CONVERTER:
|
146 |
+
logging.info("Converting LaTeX output to Markdown using Gemini API")
|
147 |
+
start_convert = time.time()
|
148 |
+
|
149 |
+
# Check for cancellation before conversion
|
150 |
+
if check_cancellation():
|
151 |
+
logging.info("Cancellation detected before LaTeX conversion")
|
152 |
+
safe_delete_file(temp_input)
|
153 |
+
return "Conversion cancelled.", None
|
154 |
+
|
155 |
+
try:
|
156 |
+
markdown_content = convert_latex_to_markdown(content)
|
157 |
+
if markdown_content:
|
158 |
+
content = markdown_content
|
159 |
+
logging.info(f"LaTeX conversion completed in {time.time() - start_convert:.2f} seconds")
|
160 |
+
else:
|
161 |
+
logging.warning("LaTeX to Markdown conversion failed, using raw LaTeX output")
|
162 |
+
except Exception as e:
|
163 |
+
logging.error(f"Error converting LaTeX to Markdown: {str(e)}")
|
164 |
+
# Continue with the original content on error
|
165 |
+
|
166 |
+
# Check for cancellation after conversion
|
167 |
+
if check_cancellation():
|
168 |
+
logging.info("Cancellation detected after LaTeX conversion")
|
169 |
+
safe_delete_file(temp_input)
|
170 |
+
return "Conversion cancelled.", None
|
171 |
+
|
172 |
except Exception as e:
|
173 |
safe_delete_file(temp_input)
|
174 |
return f"Error: {e}", None
|
src/core/latex_to_markdown_converter.py
ADDED
@@ -0,0 +1,67 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import logging
|
3 |
+
from typing import Optional
|
4 |
+
from google import genai
|
5 |
+
|
6 |
+
# Configure logging
|
7 |
+
logger = logging.getLogger(__name__)
|
8 |
+
logger.setLevel(logging.DEBUG)
|
9 |
+
|
10 |
+
# Load API key from environment variable
|
11 |
+
api_key = os.getenv("GOOGLE_API_KEY")
|
12 |
+
|
13 |
+
# Check if API key is available
|
14 |
+
if not api_key:
|
15 |
+
logger.warning("GOOGLE_API_KEY environment variable not found. LaTeX to Markdown conversion may not work.")
|
16 |
+
|
17 |
+
def convert_latex_to_markdown(latex_content: str) -> Optional[str]:
|
18 |
+
"""
|
19 |
+
Convert LaTeX content to Markdown using Gemini API.
|
20 |
+
|
21 |
+
Args:
|
22 |
+
latex_content: The LaTeX content to convert
|
23 |
+
|
24 |
+
Returns:
|
25 |
+
Converted markdown content or None if conversion fails
|
26 |
+
"""
|
27 |
+
if not api_key:
|
28 |
+
logger.error("GOOGLE_API_KEY environment variable not set")
|
29 |
+
return None
|
30 |
+
|
31 |
+
try:
|
32 |
+
# Create a client
|
33 |
+
client = genai.Client(api_key=api_key)
|
34 |
+
|
35 |
+
# Set up the prompt
|
36 |
+
prompt = """
|
37 |
+
Convert this LaTeX content to clean, well-formatted Markdown.
|
38 |
+
Preserve all tables, lists, and formatting.
|
39 |
+
For tables, use standard Markdown table syntax.
|
40 |
+
For mathematical expressions, use $ for inline and $$ for display math.
|
41 |
+
Keep the structure and hierarchy of the content. Return only the markdown content, no other text.
|
42 |
+
"""
|
43 |
+
|
44 |
+
# Generate the response
|
45 |
+
response = client.models.generate_content(
|
46 |
+
model="gemini-2.0-flash",
|
47 |
+
contents=[
|
48 |
+
prompt,
|
49 |
+
latex_content
|
50 |
+
],
|
51 |
+
config={
|
52 |
+
"temperature": 0.1,
|
53 |
+
"top_p": 0.95,
|
54 |
+
"top_k": 40,
|
55 |
+
"max_output_tokens": 8192,
|
56 |
+
}
|
57 |
+
)
|
58 |
+
|
59 |
+
# Extract the markdown text from the response
|
60 |
+
markdown_text = response.text
|
61 |
+
|
62 |
+
logger.info("Successfully converted LaTeX to Markdown")
|
63 |
+
return markdown_text
|
64 |
+
|
65 |
+
except Exception as e:
|
66 |
+
logger.error(f"Error converting LaTeX to Markdown: {str(e)}")
|
67 |
+
return None
|
src/parsers/gemini_flash_parser.py
CHANGED
@@ -79,6 +79,7 @@ class GeminiFlashParser(DocumentParser):
|
|
79 |
Convert this document to markdown format.
|
80 |
Preserve the structure, headings, lists, tables, and formatting as much as possible.
|
81 |
For images, include a brief description in markdown image syntax.
|
|
|
82 |
"""
|
83 |
|
84 |
# Generate the response
|
@@ -92,7 +93,7 @@ class GeminiFlashParser(DocumentParser):
|
|
92 |
)
|
93 |
],
|
94 |
config={
|
95 |
-
"temperature": 0.
|
96 |
"top_p": 0.95,
|
97 |
"top_k": 40,
|
98 |
"max_output_tokens": 8192,
|
|
|
79 |
Convert this document to markdown format.
|
80 |
Preserve the structure, headings, lists, tables, and formatting as much as possible.
|
81 |
For images, include a brief description in markdown image syntax.
|
82 |
+
Return only the markdown content, no other text.
|
83 |
"""
|
84 |
|
85 |
# Generate the response
|
|
|
93 |
)
|
94 |
],
|
95 |
config={
|
96 |
+
"temperature": 0.1,
|
97 |
"top_p": 0.95,
|
98 |
"top_k": 40,
|
99 |
"max_output_tokens": 8192,
|
src/parsers/got_ocr_parser.py
CHANGED
@@ -17,8 +17,8 @@ import copy
|
|
17 |
from src.parsers.parser_interface import DocumentParser
|
18 |
from src.parsers.parser_registry import ParserRegistry
|
19 |
|
20 |
-
# Import latex2markdown for conversion
|
21 |
-
import latex2markdown
|
22 |
|
23 |
# Configure logging
|
24 |
logger = logging.getLogger(__name__)
|
|
|
17 |
from src.parsers.parser_interface import DocumentParser
|
18 |
from src.parsers.parser_registry import ParserRegistry
|
19 |
|
20 |
+
# Import latex2markdown for conversion - No longer needed, using Gemini API
|
21 |
+
# import latex2markdown
|
22 |
|
23 |
# Configure logging
|
24 |
logger = logging.getLogger(__name__)
|