Spaces:

Ansemin101
/

Markit_v2

Runtime error

App Files Files Community

AnseMin committed on Mar 19

Commit

33f1b65

1 Parent(s): 5910e0d

Latex2Markdown display changes --attempt1

Browse files

Files changed (8) hide show

app.py +18 -2
requirements.txt +1 -0
setup.sh +1 -1
src/core/parser_factory.py +8 -0
src/parsers/got_ocr_integration.py +41 -0
src/parsers/got_ocr_parser.py +7 -6
src/utils/__init__.py +4 -0
src/utils/latex_converter.py +174 -0

app.py CHANGED Viewed

@@ -50,8 +50,24 @@ try:
     import transformers
     print(f"Transformers version: {transformers.__version__}")
 except ImportError:
-    print("WARNING: Transformers not installed. Installing transformers from GitHub...")
-    subprocess.run([sys.executable, "-m", "pip", "install", "-q", "git+https://github.com/huggingface/transformers.git@main", "accelerate", "verovio"], check=False)
 # Check if numpy is installed with the correct version
 try:

     import transformers
     print(f"Transformers version: {transformers.__version__}")
 except ImportError:
+    print("WARNING: Transformers not installed. Installing from GitHub...")
+    subprocess.run([sys.executable, "-m", "pip", "install", "-q", "git+https://github.com/huggingface/transformers.git@main"], check=False)
+# LaTeX conversion depends on the latex2markdown package; install it on demand when the import fails
+try:
+    import latex2markdown
+except ImportError:
+    print("WARNING: LaTeX2Markdown module not found. Installing...")
+    subprocess.run([sys.executable, "-m", "pip", "install", "-q", "latex2markdown"], check=False)
+else:
+    print("LaTeX2Markdown module found for advanced LaTeX conversion")
+# LaTeX conversion also depends on the regex package; report its version when it exposes one
+try:
+    import regex
+except ImportError:
+    print("WARNING: Regex module not found. Installing...")
+    subprocess.run([sys.executable, "-m", "pip", "install", "-q", "regex>=2023.0.0"], check=False)
+else:
+    print(f"Regex module found: {getattr(regex, '__version__', 'version unknown')}")
 # Check if numpy is installed with the correct version
 try:

requirements.txt CHANGED Viewed

@@ -14,6 +14,7 @@ opencv-python  # Match exact dependency from GOT-OCR
 python-dotenv>=1.0.0
 pydantic==2.7.1
 latex2markdown>=0.1.0  # For LaTeX to Markdown conversion
 # Gemini API client
 google-genai>=0.1.0

 python-dotenv>=1.0.0
 pydantic==2.7.1
 latex2markdown>=0.1.0  # For LaTeX to Markdown conversion
+regex>=2023.0.0  # For advanced regex pattern matching
 # Gemini API client
 google-genai>=0.1.0

setup.sh CHANGED Viewed

@@ -29,7 +29,7 @@ echo "NumPy installed successfully"
 echo "Installing Python dependencies..."
 pip install -q -U pillow opencv-python
 pip install -q -U google-genai
-pip install -q -U latex2markdown
 echo "Python dependencies installed successfully"
 # Install GOT-OCR transformers dependencies

 echo "Installing Python dependencies..."
 pip install -q -U pillow opencv-python
 pip install -q -U google-genai
+# Quote the version specifier: unquoted, the shell parses ">=2023.0.0" as an
+# output redirection to a file named "=2023.0.0" and pip never sees the pin.
+pip install -q -U latex2markdown "regex>=2023.0.0"
 echo "Python dependencies installed successfully"
 # Install GOT-OCR transformers dependencies

src/core/parser_factory.py CHANGED Viewed

@@ -7,6 +7,9 @@ import time
 from src.parsers.parser_interface import DocumentParser
 from src.parsers.parser_registry import ParserRegistry
 class ParserFactory:
     """Factory for creating parser instances."""
@@ -91,5 +94,10 @@ class ParserFactory:
         # Check one more time after parsing completes
         if check_cancellation():
             return "Conversion cancelled."
         return result

 from src.parsers.parser_interface import DocumentParser
 from src.parsers.parser_registry import ParserRegistry
+# Import the GOT-OCR integration module for post-processing
+from src.parsers.got_ocr_integration import process_got_ocr_output
 class ParserFactory:
     """Factory for creating parser instances."""
         # Check one more time after parsing completes
         if check_cancellation():
             return "Conversion cancelled."
+        # Post-process the result for GOT-OCR parser
+        if "GOT-OCR" in parser_name:
+            logging.info(f"Post-processing GOT-OCR output for {ocr_method_name}")
+            result = process_got_ocr_output(result, ocr_method_name, output_format)
         return result

src/parsers/got_ocr_integration.py ADDED Viewed

	@@ -0,0 +1,41 @@

+import logging
+from typing import Optional, Dict, Any
+import os
+from pathlib import Path
+# Import the LaTeX converter utility
+from src.utils.latex_converter import convert_latex_to_markdown
+# Configure logging
+logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+logger = logging.getLogger(__name__)
+def process_got_ocr_output(output_text: str, ocr_method: str, output_format: str) -> str:
+    """
+    Post-process GOT-OCR parser output, converting LaTeX to Markdown when requested.
+    Conversion is attempted only when ``ocr_method`` equals "formatted text" and
+    ``output_format`` equals "markdown" (both compared case-insensitively).
+    Args:
+        output_text: The raw output text from the GOT-OCR parser
+        ocr_method: The OCR method used (Plain Text, Formatted Text)
+        output_format: The desired output format (Markdown, etc.)
+    Returns:
+        str: An empty string for falsy input, the converted Markdown when the
+            conversion succeeds, otherwise ``output_text`` unchanged
+    """
+    # Nothing to do for empty output
+    if not output_text:
+        return ""
+    wants_markdown = (
+        ocr_method.lower() == "formatted text"
+        and output_format.lower() == "markdown"
+    )
+    # Any other method/format combination passes through untouched
+    if not wants_markdown:
+        return output_text
+    logger.info("Converting LaTeX output to enhanced Markdown format")
+    try:
+        converted = convert_latex_to_markdown(output_text)
+        logger.info("LaTeX to Markdown conversion successful")
+        return converted
+    except Exception as e:
+        # Best effort: a failed conversion falls back to the raw parser output
+        logger.error(f"Error converting LaTeX to Markdown: {str(e)}")
+        return output_text

src/parsers/got_ocr_parser.py CHANGED Viewed

@@ -227,9 +227,10 @@ class GotOcrParser(DocumentParser):
                 skip_special_tokens=True,
             )
-            # Convert to Markdown if it's formatted
-            l2m = latex2markdown.LaTeX2Markdown(result)
-            result = l2m.to_markdown()
         else:
             # Plain text mode
             inputs = processor([image], return_tensors="pt")
@@ -318,9 +319,9 @@ class GotOcrParser(DocumentParser):
                     skip_special_tokens=True,
                 )
-                # Convert to Markdown if it's formatted
-                l2m = latex2markdown.LaTeX2Markdown(result)
-                result = l2m.to_markdown()
             else:
                 # Plain text mode
                 inputs = processor([image], return_tensors="pt")

                 skip_special_tokens=True,
             )
+            # Return raw LaTeX output - let post-processing handle conversion
+            # This allows for more advanced conversion in the integration module
+            logger.info("Returning raw LaTeX output for external processing")
         else:
             # Plain text mode
             inputs = processor([image], return_tensors="pt")
                     skip_special_tokens=True,
                 )
+                # Return raw LaTeX output - let post-processing handle conversion
+                # This allows for more advanced conversion in the integration module
+                logger.info("Returning raw LaTeX output for external processing")
             else:
                 # Plain text mode
                 inputs = processor([image], return_tensors="pt")

src/utils/__init__.py ADDED Viewed

	@@ -0,0 +1,4 @@

+"""
+Utilities package for Markit.
+Contains shared utility functions and helper modules.
+"""

src/utils/latex_converter.py ADDED Viewed

	@@ -0,0 +1,174 @@

+import re
+import logging
+from typing import Dict, List, Tuple, Optional
+import latex2markdown
+# Configure logging
+logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+logger = logging.getLogger(__name__)
+class LatexConverter:
+    """Enhanced LaTeX to Markdown converter that handles complex LaTeX structures.
+    Conversion runs in three stages: tabular environments are normalised, the
+    ``latex2markdown`` library performs the base conversion, and a final pass
+    rewrites math delimiters, inline formatting commands and escape sequences.
+    """
+    # A tabular environment: optional position argument, a column spec (one level
+    # of nested braces allowed, e.g. ``p{3cm}``), then the body up to ``\end``.
+    _TABULAR_RE = re.compile(
+        r'\\begin\{tabular\}\s*(?:\[[^\]]*\])?\s*\{(?:[^{}]|\{[^{}]*\})*\}(.*?)\\end\{tabular\}',
+        re.DOTALL,
+    )
+    # Rows are separated by ``\\`` or ``\hline``.
+    _ROW_BREAK_RE = re.compile(r'\\\\|\\hline')
+    # Cells are separated by ``&``; an escaped ``\&`` is literal text, not a separator.
+    _CELL_BREAK_RE = re.compile(r'(?<!\\)&')
+    # Inline formatting commands and their Markdown equivalents. Arguments must be
+    # brace-free so the matching closing brace is unambiguous.
+    _INLINE_STYLES = (
+        (re.compile(r'\\textbf\{([^{}]*)\}'), r'**\1**'),
+        (re.compile(r'\\textit\{([^{}]*)\}'), r'*\1*'),
+    )
+    @staticmethod
+    def convert(latex_text: str) -> str:
+        """
+        Convert LaTeX text to Markdown, with special handling for tables and other structures.
+        Args:
+            latex_text: Raw LaTeX text from the GOT-OCR model
+        Returns:
+            str: Converted Markdown text; an empty string when the input is
+                empty or not a string
+        """
+        if not latex_text or not isinstance(latex_text, str):
+            return ""
+        # Stage 1: normalise tables before the standard conversion
+        processed_text = LatexConverter._preprocess_tables(latex_text)
+        # Stage 2: base conversion with the latex2markdown library
+        try:
+            l2m = latex2markdown.LaTeX2Markdown(processed_text)
+            processed_text = l2m.to_markdown()
+        except Exception as e:
+            # Best effort: keep the pre-processed text and continue with our own fixes
+            logger.error(f"Error in standard latex2markdown conversion: {str(e)}")
+        # Stage 3: fix up whatever the library left behind
+        return LatexConverter._postprocess_markdown(processed_text)
+    @staticmethod
+    def _preprocess_tables(latex_text: str) -> str:
+        """
+        Rewrite every ``tabular`` environment into a uniform, easy-to-parse shape.
+        Each table gets a ``c|c|...`` column spec sized from the first row that
+        contains a cell separator, ``\\hline`` rules are dropped, and every row is
+        emitted as ``cell & cell \\\\`` on its own line. A ``tabular`` without a
+        column spec directly after ``\\begin{tabular}`` is left untouched.
+        Args:
+            latex_text: Raw LaTeX text
+        Returns:
+            str: Pre-processed LaTeX text with table modifications
+        """
+        def _normalise(match) -> str:
+            # Split the table body into non-empty rows, then each row into cells
+            rows = [row.strip() for row in LatexConverter._ROW_BREAK_RE.split(match.group(1))]
+            cells_by_row = [
+                [cell.strip() for cell in LatexConverter._CELL_BREAK_RE.split(row)]
+                for row in rows
+                if row
+            ]
+            # Column count comes from the first row that has a separator; default is 1
+            num_cols = next((len(cells) for cells in cells_by_row if len(cells) > 1), 1)
+            lines = ["\\begin{tabular}{" + "|".join(["c"] * num_cols) + "}"]
+            lines.extend(" & ".join(cells) + " \\\\" for cells in cells_by_row)
+            lines.append("\\end{tabular}")
+            return "\n".join(lines)
+        # A callback replacement rewrites each table in place; its return value is
+        # inserted literally, so backslashes in the table need no extra escaping.
+        return LatexConverter._TABULAR_RE.sub(_normalise, latex_text)
+    @staticmethod
+    def _postprocess_markdown(markdown_text: str) -> str:
+        """
+        Post-process the converted Markdown to fix any remaining issues.
+        Rewrites ``\\[...\\]`` and ``\\(...\\)`` as ``$$...$$`` and ``$...$``,
+        turns ``\\textbf{...}`` / ``\\textit{...}`` into ``**...**`` / ``*...*``,
+        and unescapes ``\\%``, ``\\$`` and ``\\&``. All other braces are kept,
+        so math such as ``\\frac{a}{b}`` survives intact.
+        Args:
+            markdown_text: Converted Markdown text
+        Returns:
+            str: Post-processed Markdown text
+        """
+        processed_text = markdown_text
+        # Fix math blocks
+        processed_text = re.sub(r'\\\[(.*?)\\\]', r'$$\1$$', processed_text, flags=re.DOTALL)
+        processed_text = re.sub(r'\\\((.*?)\\\)', r'$\1$', processed_text, flags=re.DOTALL)
+        # Fix formatting commands. Repeat until nothing changes so nested commands
+        # are resolved innermost-first; every substitution removes one command,
+        # so the loop terminates.
+        changed = True
+        while changed:
+            changed = False
+            for pattern, replacement in LatexConverter._INLINE_STYLES:
+                processed_text, count = pattern.subn(replacement, processed_text)
+                changed = changed or count > 0
+        # Fix escape sequences
+        for escaped, plain in (('\\%', '%'), ('\\$', '$'), ('\\&', '&')):
+            processed_text = processed_text.replace(escaped, plain)
+        return processed_text
+def convert_latex_to_markdown(latex_text: str) -> str:
+    """
+    Module-level shortcut that delegates to ``LatexConverter.convert``.
+    Args:
+        latex_text: Raw LaTeX text from the GOT-OCR model
+    Returns:
+        str: Converted Markdown text
+    """
+    markdown_text = LatexConverter.convert(latex_text)
+    return markdown_text