Spaces:

Ansemin101
/

Markit_v2

Running on Zero

App Files Community

AnseMin committed on Mar 19

Commit

23ad33e

1 Parent(s): 34d180e

restore to version 1

Browse files

Files changed (7) hide show

app.py +2 -18
requirements.txt +0 -1
setup.sh +1 -1
src/core/parser_factory.py +0 -8
src/parsers/got_ocr_integration.py +0 -41
src/utils/__init__.py +0 -4
src/utils/latex_converter.py +0 -194

app.py CHANGED Viewed

@@ -50,24 +50,8 @@ try:
     import transformers
     print(f"Transformers version: {transformers.__version__}")
 except ImportError:
-    print("WARNING: Transformers not installed. Installing from GitHub...")
-    subprocess.run([sys.executable, "-m", "pip", "install", "-q", "git+https://github.com/huggingface/transformers.git@main"], check=False)
-# Check if latex2markdown module is installed (needed for LaTeX conversion)
-try:
-    import latex2markdown
-    print("LaTeX2Markdown module found for advanced LaTeX conversion")
-except ImportError:
-    print("WARNING: LaTeX2Markdown module not found. Installing...")
-    subprocess.run([sys.executable, "-m", "pip", "install", "-q", "latex2markdown"], check=False)
-# Check if regex module is installed (needed for LaTeX conversion)
-try:
-    import regex
-    print(f"Regex module found: {regex.__version__ if hasattr(regex, '__version__') else 'version unknown'}")
-except ImportError:
-    print("WARNING: Regex module not found. Installing...")
-    subprocess.run([sys.executable, "-m", "pip", "install", "-q", "regex>=2023.0.0"], check=False)
 # Check if numpy is installed with the correct version
 try:

     import transformers
     print(f"Transformers version: {transformers.__version__}")
 except ImportError:
+    print("WARNING: Transformers not installed. Installing transformers from GitHub...")
+    subprocess.run([sys.executable, "-m", "pip", "install", "-q", "git+https://github.com/huggingface/transformers.git@main", "accelerate", "verovio"], check=False)
 # Check if numpy is installed with the correct version
 try:

requirements.txt CHANGED Viewed

@@ -14,7 +14,6 @@ opencv-python  # Match exact dependency from GOT-OCR
 python-dotenv>=1.0.0
 pydantic==2.7.1
 latex2markdown>=0.1.0  # For LaTeX to Markdown conversion
-regex>=2023.0.0  # For advanced regex pattern matching
 # Gemini API client
 google-genai>=0.1.0

 python-dotenv>=1.0.0
 pydantic==2.7.1
 latex2markdown>=0.1.0  # For LaTeX to Markdown conversion
 # Gemini API client
 google-genai>=0.1.0

setup.sh CHANGED Viewed

@@ -29,7 +29,7 @@ echo "NumPy installed successfully"
 echo "Installing Python dependencies..."
 pip install -q -U pillow opencv-python
 pip install -q -U google-genai
-pip install -q -U latex2markdown regex>=2023.0.0
 echo "Python dependencies installed successfully"
 # Install GOT-OCR transformers dependencies

 echo "Installing Python dependencies..."
 pip install -q -U pillow opencv-python
 pip install -q -U google-genai
+pip install -q -U latex2markdown
 echo "Python dependencies installed successfully"
 # Install GOT-OCR transformers dependencies

src/core/parser_factory.py CHANGED Viewed

@@ -7,9 +7,6 @@ import time
 from src.parsers.parser_interface import DocumentParser
 from src.parsers.parser_registry import ParserRegistry
-# Import the GOT-OCR integration module for post-processing
-from src.parsers.got_ocr_integration import process_got_ocr_output
 class ParserFactory:
     """Factory for creating parser instances."""
@@ -94,10 +91,5 @@ class ParserFactory:
         # Check one more time after parsing completes
         if check_cancellation():
             return "Conversion cancelled."
-        # Post-process the result for GOT-OCR parser
-        if "GOT-OCR" in parser_name:
-            logging.info(f"Post-processing GOT-OCR output for {ocr_method_name}")
-            result = process_got_ocr_output(result, ocr_method_name, output_format)
         return result

 from src.parsers.parser_interface import DocumentParser
 from src.parsers.parser_registry import ParserRegistry
 class ParserFactory:
     """Factory for creating parser instances."""
         # Check one more time after parsing completes
         if check_cancellation():
             return "Conversion cancelled."
         return result

src/parsers/got_ocr_integration.py DELETED Viewed

@@ -1,41 +0,0 @@
-import logging
-from typing import Optional, Dict, Any
-import os
-from pathlib import Path
-# Import the LaTeX converter utility
-from src.utils.latex_converter import convert_latex_to_markdown
-# Configure logging
-logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
-logger = logging.getLogger(__name__)
-def process_got_ocr_output(output_text: str, ocr_method: str, output_format: str) -> str:
-    """
-    Process the output from GOT-OCR parser and convert if needed.
-    Args:
-        output_text: The raw output text from the GOT-OCR parser
-        ocr_method: The OCR method used (Plain Text, Formatted Text)
-        output_format: The desired output format (Markdown, etc.)
-    Returns:
-        str: The processed text
-    """
-    if not output_text:
-        return ""
-    # If not using formatted text or not requesting Markdown, return the original text
-    if ocr_method.lower() != "formatted text" or output_format.lower() != "markdown":
-        return output_text
-    # Process the formatted text (LaTeX) into enhanced Markdown
-    logger.info("Converting LaTeX output to enhanced Markdown format")
-    try:
-        markdown_text = convert_latex_to_markdown(output_text)
-        logger.info("LaTeX to Markdown conversion successful")
-        return markdown_text
-    except Exception as e:
-        logger.error(f"Error converting LaTeX to Markdown: {str(e)}")
-        # Return the original text if conversion fails
-        return output_text

src/utils/__init__.py DELETED Viewed

@@ -1,4 +0,0 @@
-"""
-Utilities package for Markit.
-Contains shared utility functions and helper modules.
-"""

src/utils/latex_converter.py DELETED Viewed

@@ -1,194 +0,0 @@
-import re
-import logging
-from typing import Dict, List, Tuple, Optional
-import latex2markdown
-# Configure logging
-logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
-logger = logging.getLogger(__name__)
-class LatexConverter:
-    """Enhanced LaTeX to Markdown converter that handles complex LaTeX structures."""
-    @staticmethod
-    def convert(latex_text: str) -> str:
-        """
-        Convert LaTeX text to Markdown, with special handling for tables and other structures.
-        Args:
-            latex_text: Raw LaTeX text from the GOT-OCR model
-        Returns:
-            str: Converted Markdown text
-        """
-        if not latex_text or not isinstance(latex_text, str):
-            return ""
-        # Process the text in stages
-        processed_text = latex_text
-        # Stage 1: Pre-process tables before standard conversion
-        processed_text, tables_dict = LatexConverter._extract_tables(processed_text)
-        # Stage 2: Convert using latex2markdown library
-        try:
-            # Use the standard latex2markdown library as a base - FOLLOWING OFFICIAL DOCUMENTATION
-            l2m = latex2markdown.LaTeX2Markdown(processed_text)
-            processed_text = l2m.to_markdown()
-        except Exception as e:
-            logger.error(f"Error in standard latex2markdown conversion: {str(e)}")
-            # Continue with our custom processing even if the standard library fails
-        # Stage 3: Post-process to fix any remaining issues
-        processed_text = LatexConverter._postprocess_markdown(processed_text)
-        # Stage 4: Reinsert tables as markdown tables
-        processed_text = LatexConverter._reinsert_tables(processed_text, tables_dict)
-        return processed_text
-    @staticmethod
-    def _extract_tables(latex_text: str) -> tuple:
-        """
-        Extract tables from LaTeX and replace with placeholders.
-        Args:
-            latex_text: Raw LaTeX text
-        Returns:
-            tuple: (processed text with placeholders, dict of tables)
-        """
-        processed_text = latex_text
-        tables_dict = {}
-        # Find all tabular environments
-        table_pattern = r'\\begin{tabular}(.*?)\\end{tabular}'
-        tables = re.findall(table_pattern, processed_text, re.DOTALL)
-        for i, table_content in enumerate(tables):
-            placeholder = f"TABLE_PLACEHOLDER_{i}"
-            tables_dict[placeholder] = table_content
-            # Replace the table with a placeholder
-            processed_text = processed_text.replace(
-                f"\\begin{{tabular}}{table_content}\\end{{tabular}}",
-                placeholder
-            )
-        return processed_text, tables_dict
-    @staticmethod
-    def _reinsert_tables(markdown_text: str, tables_dict: dict) -> str:
-        """
-        Convert LaTeX tables to Markdown tables and reinsert them.
-        Args:
-            markdown_text: Processed markdown text with placeholders
-            tables_dict: Dictionary of tables extracted from LaTeX
-        Returns:
-            str: Markdown text with tables converted and reinserted
-        """
-        processed_text = markdown_text
-        for placeholder, table_content in tables_dict.items():
-            # Convert LaTeX table to Markdown table
-            markdown_table = LatexConverter._convert_table_to_markdown(table_content)
-            # Replace the placeholder with the Markdown table
-            processed_text = processed_text.replace(placeholder, markdown_table)
-        return processed_text
-    @staticmethod
-    def _convert_table_to_markdown(table_content: str) -> str:
-        """
-        Convert a LaTeX table to Markdown format.
-        Args:
-            table_content: LaTeX table content
-        Returns:
-            str: Markdown table
-        """
-        # Extract the column specification
-        col_spec_match = re.search(r'{([^}]*)}', table_content)
-        if not col_spec_match:
-            return f"[Table conversion failed]"
-        # Process the table content
-        rows_text = re.sub(r'{[^}]*}', '', table_content, count=1)  # Remove the column spec
-        # Split into rows by \\ or \hline
-        rows = re.split(r'\\\\|\\hline', rows_text)
-        rows = [row.strip() for row in rows if row.strip()]
-        if not rows:
-            return "[Empty table]"
-        # Calculate number of columns based on the number of & in the first non-empty row plus 1
-        num_cols = 1  # Default
-        for row in rows:
-            if '&' in row:
-                num_cols = row.count('&') + 1
-                break
-        # Build markdown table
-        markdown_table = []
-        # Add header row
-        if rows:
-            first_row = rows[0]
-            cells = [cell.strip() for cell in first_row.split('&')]
-            markdown_table.append("| " + " | ".join(cells + [""] * (num_cols - len(cells))) + " |")
-            # Add separator row
-            markdown_table.append("| " + " | ".join(["---"] * num_cols) + " |")
-            # Add data rows
-            for row in rows[1:]:
-                cells = [cell.strip() for cell in row.split('&')]
-                markdown_table.append("| " + " | ".join(cells + [""] * (num_cols - len(cells))) + " |")
-        return "\n".join(markdown_table)
-    @staticmethod
-    def _postprocess_markdown(markdown_text: str) -> str:
-        """
-        Post-process the converted Markdown to fix any remaining issues.
-        Args:
-            markdown_text: Converted Markdown text
-        Returns:
-            str: Post-processed Markdown text
-        """
-        processed_text = markdown_text
-        # Fix math blocks
-        processed_text = re.sub(r'\\\[(.*?)\\\]', r'$$\1$$', processed_text, flags=re.DOTALL)
-        processed_text = re.sub(r'\\\((.*?)\\\)', r'$\1$', processed_text, flags=re.DOTALL)
-        # Fix formatting issues
-        processed_text = processed_text.replace('\\textbf{', '**')
-        processed_text = processed_text.replace('\\textit{', '*')
-        processed_text = processed_text.replace('}', '')  # Remove closing braces
-        # Fix escape sequences
-        processed_text = processed_text.replace('\\%', '%')
-        processed_text = processed_text.replace('\\$', '$')
-        processed_text = processed_text.replace('\\&', '&')
-        return processed_text
-def convert_latex_to_markdown(latex_text: str) -> str:
-    """
-    Convenience function to convert LaTeX to Markdown.
-    Args:
-        latex_text: Raw LaTeX text from the GOT-OCR model
-    Returns:
-        str: Converted Markdown text
-    """
-    return LatexConverter.convert(latex_text)