AnseMin committed on
Commit
5b7f920
·
1 Parent(s): 23ad33e

Approach #2 -- converting LaTeX output from GOT-OCR to Markdown

Browse files
app.py CHANGED
@@ -77,9 +77,10 @@ gemini_api_key = os.getenv("GOOGLE_API_KEY")
77
 
78
  # Check if API key is available and print a message if not
79
  if not gemini_api_key:
80
- print("Warning: GOOGLE_API_KEY environment variable not found. Gemini Flash parser may not work.")
81
  else:
82
  print(f"Found Gemini API key: {gemini_api_key[:5]}...{gemini_api_key[-5:] if len(gemini_api_key) > 10 else ''}")
 
83
 
84
  # Add the current directory to the Python path
85
  sys.path.append(current_dir)
 
77
 
78
  # Check if API key is available and print a message if not
79
  if not gemini_api_key:
80
+ print("Warning: GOOGLE_API_KEY environment variable not found. Gemini Flash parser and LaTeX to Markdown conversion may not work.")
81
  else:
82
  print(f"Found Gemini API key: {gemini_api_key[:5]}...{gemini_api_key[-5:] if len(gemini_api_key) > 10 else ''}")
83
+ print("Gemini API will be used for LaTeX to Markdown conversion when using GOT-OCR with Formatted Text mode")
84
 
85
  # Add the current directory to the Python path
86
  sys.path.append(current_dir)
requirements.txt CHANGED
@@ -13,7 +13,6 @@ opencv-python # Match exact dependency from GOT-OCR
13
  # Utility dependencies
14
  python-dotenv>=1.0.0
15
  pydantic==2.7.1
16
- latex2markdown>=0.1.0 # For LaTeX to Markdown conversion
17
 
18
  # Gemini API client
19
  google-genai>=0.1.0
 
13
  # Utility dependencies
14
  python-dotenv>=1.0.0
15
  pydantic==2.7.1
 
16
 
17
  # Gemini API client
18
  google-genai>=0.1.0
setup.sh CHANGED
@@ -29,7 +29,7 @@ echo "NumPy installed successfully"
29
  echo "Installing Python dependencies..."
30
  pip install -q -U pillow opencv-python
31
  pip install -q -U google-genai
32
- pip install -q -U latex2markdown
33
  echo "Python dependencies installed successfully"
34
 
35
  # Install GOT-OCR transformers dependencies
 
29
  echo "Installing Python dependencies..."
30
  pip install -q -U pillow opencv-python
31
  pip install -q -U google-genai
32
+ # pip install -q -U latex2markdown - removed, now using Gemini API for LaTeX conversion
33
  echo "Python dependencies installed successfully"
34
 
35
  # Install GOT-OCR transformers dependencies
src/core/converter.py CHANGED
@@ -10,6 +10,14 @@ from src.core.parser_factory import ParserFactory
10
  # Import all parsers to ensure they're registered
11
  from src import parsers
12
 
 
 
 
 
 
 
 
 
13
  # Reference to the cancellation flag from ui.py
14
  # This will be set by the UI when the cancel button is clicked
15
  conversion_cancelled = None # Will be a threading.Event object
@@ -133,6 +141,34 @@ def convert_file(file_path, parser_name, ocr_method_name, output_format):
133
  safe_delete_file(temp_input)
134
  return "Conversion cancelled.", None
135
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
136
  except Exception as e:
137
  safe_delete_file(temp_input)
138
  return f"Error: {e}", None
 
10
  # Import all parsers to ensure they're registered
11
  from src import parsers
12
 
13
+ # Import the LaTeX to Markdown converter
14
+ try:
15
+ from src.core.latex_to_markdown_converter import convert_latex_to_markdown
16
+ HAS_GEMINI_CONVERTER = True
17
+ except ImportError:
18
+ HAS_GEMINI_CONVERTER = False
19
+ logging.warning("LaTeX to Markdown converter not available. Raw LaTeX will be returned for formatted text.")
20
+
21
  # Reference to the cancellation flag from ui.py
22
  # This will be set by the UI when the cancel button is clicked
23
  conversion_cancelled = None # Will be a threading.Event object
 
141
  safe_delete_file(temp_input)
142
  return "Conversion cancelled.", None
143
 
144
+ # Process LaTeX content for GOT-OCR formatted text
145
+ if parser_name == "GOT-OCR (jpg,png only)" and ocr_method_name == "Formatted Text" and HAS_GEMINI_CONVERTER:
146
+ logging.info("Converting LaTeX output to Markdown using Gemini API")
147
+ start_convert = time.time()
148
+
149
+ # Check for cancellation before conversion
150
+ if check_cancellation():
151
+ logging.info("Cancellation detected before LaTeX conversion")
152
+ safe_delete_file(temp_input)
153
+ return "Conversion cancelled.", None
154
+
155
+ try:
156
+ markdown_content = convert_latex_to_markdown(content)
157
+ if markdown_content:
158
+ content = markdown_content
159
+ logging.info(f"LaTeX conversion completed in {time.time() - start_convert:.2f} seconds")
160
+ else:
161
+ logging.warning("LaTeX to Markdown conversion failed, using raw LaTeX output")
162
+ except Exception as e:
163
+ logging.error(f"Error converting LaTeX to Markdown: {str(e)}")
164
+ # Continue with the original content on error
165
+
166
+ # Check for cancellation after conversion
167
+ if check_cancellation():
168
+ logging.info("Cancellation detected after LaTeX conversion")
169
+ safe_delete_file(temp_input)
170
+ return "Conversion cancelled.", None
171
+
172
  except Exception as e:
173
  safe_delete_file(temp_input)
174
  return f"Error: {e}", None
src/core/latex_to_markdown_converter.py ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import logging
3
+ from typing import Optional
4
+ from google import genai
5
+
6
+ # Configure logging
7
+ logger = logging.getLogger(__name__)
8
+ logger.setLevel(logging.DEBUG)
9
+
10
+ # Load API key from environment variable
11
+ api_key = os.getenv("GOOGLE_API_KEY")
12
+
13
+ # Check if API key is available
14
+ if not api_key:
15
+ logger.warning("GOOGLE_API_KEY environment variable not found. LaTeX to Markdown conversion may not work.")
16
+
17
+ def convert_latex_to_markdown(latex_content: str) -> Optional[str]:
18
+ """
19
+ Convert LaTeX content to Markdown using Gemini API.
20
+
21
+ Args:
22
+ latex_content: The LaTeX content to convert
23
+
24
+ Returns:
25
+ Converted markdown content or None if conversion fails
26
+ """
27
+ if not api_key:
28
+ logger.error("GOOGLE_API_KEY environment variable not set")
29
+ return None
30
+
31
+ try:
32
+ # Create a client
33
+ client = genai.Client(api_key=api_key)
34
+
35
+ # Set up the prompt
36
+ prompt = """
37
+ Convert this LaTeX content to clean, well-formatted Markdown.
38
+ Preserve all tables, lists, and formatting.
39
+ For tables, use standard Markdown table syntax.
40
+ For mathematical expressions, use $ for inline and $$ for display math.
41
+ Keep the structure and hierarchy of the content. Return only the markdown content, no other text.
42
+ """
43
+
44
+ # Generate the response
45
+ response = client.models.generate_content(
46
+ model="gemini-2.0-flash",
47
+ contents=[
48
+ prompt,
49
+ latex_content
50
+ ],
51
+ config={
52
+ "temperature": 0.1,
53
+ "top_p": 0.95,
54
+ "top_k": 40,
55
+ "max_output_tokens": 8192,
56
+ }
57
+ )
58
+
59
+ # Extract the markdown text from the response
60
+ markdown_text = response.text
61
+
62
+ logger.info("Successfully converted LaTeX to Markdown")
63
+ return markdown_text
64
+
65
+ except Exception as e:
66
+ logger.error(f"Error converting LaTeX to Markdown: {str(e)}")
67
+ return None
src/parsers/gemini_flash_parser.py CHANGED
@@ -79,6 +79,7 @@ class GeminiFlashParser(DocumentParser):
79
  Convert this document to markdown format.
80
  Preserve the structure, headings, lists, tables, and formatting as much as possible.
81
  For images, include a brief description in markdown image syntax.
 
82
  """
83
 
84
  # Generate the response
@@ -92,7 +93,7 @@ class GeminiFlashParser(DocumentParser):
92
  )
93
  ],
94
  config={
95
- "temperature": 0.2,
96
  "top_p": 0.95,
97
  "top_k": 40,
98
  "max_output_tokens": 8192,
 
79
  Convert this document to markdown format.
80
  Preserve the structure, headings, lists, tables, and formatting as much as possible.
81
  For images, include a brief description in markdown image syntax.
82
+ Return only the markdown content, no other text.
83
  """
84
 
85
  # Generate the response
 
93
  )
94
  ],
95
  config={
96
+ "temperature": 0.1,
97
  "top_p": 0.95,
98
  "top_k": 40,
99
  "max_output_tokens": 8192,
src/parsers/got_ocr_parser.py CHANGED
@@ -17,8 +17,8 @@ import copy
17
  from src.parsers.parser_interface import DocumentParser
18
  from src.parsers.parser_registry import ParserRegistry
19
 
20
- # Import latex2markdown for conversion
21
- import latex2markdown
22
 
23
  # Configure logging
24
  logger = logging.getLogger(__name__)
 
17
  from src.parsers.parser_interface import DocumentParser
18
  from src.parsers.parser_registry import ParserRegistry
19
 
20
+ # Import latex2markdown for conversion - No longer needed, using Gemini API
21
+ # import latex2markdown
22
 
23
  # Configure logging
24
  logger = logging.getLogger(__name__)