Spaces:

Ansemin101
/

Markit_v2

Runtime error

App Files Files Community

AnseMin committed on Mar 19

Commit

5b7f920

1 Parent(s): 23ad33e

Approach #2 -- converting latex output from GOT OCR to markdown

Browse files

Files changed (7) hide show

app.py +2 -1
requirements.txt +0 -1
setup.sh +1 -1
src/core/converter.py +36 -0
src/core/latex_to_markdown_converter.py +67 -0
src/parsers/gemini_flash_parser.py +2 -1
src/parsers/got_ocr_parser.py +2 -2

app.py CHANGED Viewed

@@ -77,9 +77,10 @@ gemini_api_key = os.getenv("GOOGLE_API_KEY")
 # Check if API key is available and print a message if not
 if not gemini_api_key:
-    print("Warning: GOOGLE_API_KEY environment variable not found. Gemini Flash parser may not work.")
 else:
     print(f"Found Gemini API key: {gemini_api_key[:5]}...{gemini_api_key[-5:] if len(gemini_api_key) > 10 else ''}")
 # Add the current directory to the Python path
 sys.path.append(current_dir)

 # Check if API key is available and print a message if not
 if not gemini_api_key:
+    print("Warning: GOOGLE_API_KEY environment variable not found. Gemini Flash parser and LaTeX to Markdown conversion may not work.")
 else:
     print(f"Found Gemini API key: {gemini_api_key[:5]}...{gemini_api_key[-5:] if len(gemini_api_key) > 10 else ''}")
+    print("Gemini API will be used for LaTeX to Markdown conversion when using GOT-OCR with Formatted Text mode")
 # Add the current directory to the Python path
 sys.path.append(current_dir)

requirements.txt CHANGED Viewed

@@ -13,7 +13,6 @@ opencv-python  # Match exact dependency from GOT-OCR
 # Utility dependencies
 python-dotenv>=1.0.0
 pydantic==2.7.1
-latex2markdown>=0.1.0  # For LaTeX to Markdown conversion
 # Gemini API client
 google-genai>=0.1.0

 # Utility dependencies
 python-dotenv>=1.0.0
 pydantic==2.7.1
 # Gemini API client
 google-genai>=0.1.0

setup.sh CHANGED Viewed

@@ -29,7 +29,7 @@ echo "NumPy installed successfully"
 echo "Installing Python dependencies..."
 pip install -q -U pillow opencv-python
 pip install -q -U google-genai
-pip install -q -U latex2markdown
 echo "Python dependencies installed successfully"
 # Install GOT-OCR transformers dependencies

 echo "Installing Python dependencies..."
 pip install -q -U pillow opencv-python
 pip install -q -U google-genai
+# pip install -q -U latex2markdown - removed, now using Gemini API for LaTeX conversion
 echo "Python dependencies installed successfully"
 # Install GOT-OCR transformers dependencies

src/core/converter.py CHANGED Viewed

@@ -10,6 +10,14 @@ from src.core.parser_factory import ParserFactory
 # Import all parsers to ensure they're registered
 from src import parsers
 # Reference to the cancellation flag from ui.py
 # This will be set by the UI when the cancel button is clicked
 conversion_cancelled = None  # Will be a threading.Event object
@@ -133,6 +141,34 @@ def convert_file(file_path, parser_name, ocr_method_name, output_format):
                 safe_delete_file(temp_input)
                 return "Conversion cancelled.", None
         except Exception as e:
             safe_delete_file(temp_input)
             return f"Error: {e}", None

 # Import all parsers to ensure they're registered
 from src import parsers
+# Import the LaTeX to Markdown converter
+try:
+    from src.core.latex_to_markdown_converter import convert_latex_to_markdown
+    HAS_GEMINI_CONVERTER = True
+except ImportError:
+    HAS_GEMINI_CONVERTER = False
+    logging.warning("LaTeX to Markdown converter not available. Raw LaTeX will be returned for formatted text.")
 # Reference to the cancellation flag from ui.py
 # This will be set by the UI when the cancel button is clicked
 conversion_cancelled = None  # Will be a threading.Event object
                 safe_delete_file(temp_input)
                 return "Conversion cancelled.", None
+            # Process LaTeX content for GOT-OCR formatted text
+            if parser_name == "GOT-OCR (jpg,png only)" and ocr_method_name == "Formatted Text" and HAS_GEMINI_CONVERTER:
+                logging.info("Converting LaTeX output to Markdown using Gemini API")
+                start_convert = time.time()
+                # Check for cancellation before conversion
+                if check_cancellation():
+                    logging.info("Cancellation detected before LaTeX conversion")
+                    safe_delete_file(temp_input)
+                    return "Conversion cancelled.", None
+                try:
+                    markdown_content = convert_latex_to_markdown(content)
+                    if markdown_content:
+                        content = markdown_content
+                        logging.info(f"LaTeX conversion completed in {time.time() - start_convert:.2f} seconds")
+                    else:
+                        logging.warning("LaTeX to Markdown conversion failed, using raw LaTeX output")
+                except Exception as e:
+                    logging.error(f"Error converting LaTeX to Markdown: {str(e)}")
+                    # Continue with the original content on error
+                # Check for cancellation after conversion
+                if check_cancellation():
+                    logging.info("Cancellation detected after LaTeX conversion")
+                    safe_delete_file(temp_input)
+                    return "Conversion cancelled.", None
         except Exception as e:
             safe_delete_file(temp_input)
             return f"Error: {e}", None

src/core/latex_to_markdown_converter.py ADDED Viewed

	@@ -0,0 +1,67 @@

+import os
+import logging
+from typing import Optional
+from google import genai
+# Module-level logger named after this module; setLevel gives this logger its own DEBUG threshold (handler levels still apply)
+logger = logging.getLogger(__name__)
+logger.setLevel(logging.DEBUG)
+# Read the Gemini API key once, at import time; this module-level value is not refreshed if the environment changes later
+api_key = os.getenv("GOOGLE_API_KEY")
+# Warn at import when the key is missing or empty; convert_latex_to_markdown also checks for a key and returns None without one
+if not api_key:
+    logger.warning("GOOGLE_API_KEY environment variable not found. LaTeX to Markdown conversion may not work.")
+def convert_latex_to_markdown(latex_content: str) -> Optional[str]:
+    """
+    Convert LaTeX content to Markdown using Gemini API.
+    Args:
+        latex_content: The LaTeX content to convert
+    Returns:
+        Converted markdown content or None if conversion fails
+    """
+    if not api_key:
+        logger.error("GOOGLE_API_KEY environment variable not set")
+        return None
+    try:
+        # Create a client
+        client = genai.Client(api_key=api_key)
+        # Set up the prompt
+        prompt = """
+        Convert this LaTeX content to clean, well-formatted Markdown.
+        Preserve all tables, lists, and formatting.
+        For tables, use standard Markdown table syntax.
+        For mathematical expressions, use $ for inline and $$ for display math.
+        Keep the structure and hierarchy of the content. Return only the markdown content, no other text.
+        """
+        # Generate the response
+        response = client.models.generate_content(
+            model="gemini-2.0-flash",
+            contents=[
+                prompt,
+                latex_content
+            ],
+            config={
+                "temperature": 0.1,
+                "top_p": 0.95,
+                "top_k": 40,
+                "max_output_tokens": 8192,
+            }
+        )
+        # Extract the markdown text from the response
+        markdown_text = response.text
+        logger.info("Successfully converted LaTeX to Markdown")
+        return markdown_text
+    except Exception as e:
+        logger.error(f"Error converting LaTeX to Markdown: {str(e)}")
+        return None

src/parsers/gemini_flash_parser.py CHANGED Viewed

@@ -79,6 +79,7 @@ class GeminiFlashParser(DocumentParser):
             Convert this document to markdown format.
             Preserve the structure, headings, lists, tables, and formatting as much as possible.
             For images, include a brief description in markdown image syntax.
             """
             # Generate the response
@@ -92,7 +93,7 @@ class GeminiFlashParser(DocumentParser):
                     )
                 ],
                 config={
-                    "temperature": 0.2,
                     "top_p": 0.95,
                     "top_k": 40,
                     "max_output_tokens": 8192,

             Convert this document to markdown format.
             Preserve the structure, headings, lists, tables, and formatting as much as possible.
             For images, include a brief description in markdown image syntax.
+            Return only the markdown content, no other text.
             """
             # Generate the response
                     )
                 ],
                 config={
+                    "temperature": 0.1,
                     "top_p": 0.95,
                     "top_k": 40,
                     "max_output_tokens": 8192,

src/parsers/got_ocr_parser.py CHANGED Viewed

@@ -17,8 +17,8 @@ import copy
 from src.parsers.parser_interface import DocumentParser
 from src.parsers.parser_registry import ParserRegistry
-# Import latex2markdown for conversion
-import latex2markdown
 # Configure logging
 logger = logging.getLogger(__name__)

 from src.parsers.parser_interface import DocumentParser
 from src.parsers.parser_registry import ParserRegistry
+# Import latex2markdown for conversion - No longer needed, using Gemini API
+# import latex2markdown
 # Configure logging
 logger = logging.getLogger(__name__)