Spaces:

Ansemin101
/

Markit_v2

Runtime error

App Files Files Community

AnseMin committed on Mar 18

Commit

ad248f7

1 Parent(s): 2184c47

Script to convert LaTeX to Markdown; changed the UI output to fit GOT-OCR results

Browse files

Files changed (4) hide show

src/parsers/got_ocr_parser.py +14 -1
src/ui/ui.py +6 -6
src/utils/__init__.py +5 -0
src/utils/latex_converter.py +186 -0

src/parsers/got_ocr_parser.py CHANGED Viewed

@@ -11,6 +11,7 @@ os.environ["TORCH_AMP_AUTOCAST_DTYPE"] = "float16"
 from src.parsers.parser_interface import DocumentParser
 from src.parsers.parser_registry import ParserRegistry
 # Configure logging
 logger = logging.getLogger(__name__)
@@ -159,7 +160,7 @@ class GotOcrParser(DocumentParser):
             **kwargs: Additional arguments to pass to the model
         Returns:
-            Extracted text from the image
         """
         # Verify dependencies are installed
         if not self._check_dependencies():
@@ -211,6 +212,12 @@ class GotOcrParser(DocumentParser):
                             str(file_path),
                             ocr_type='ocr'
                         )
                 return result
             except RuntimeError as e:
                 # Check if it's a bfloat16 error
@@ -243,6 +250,12 @@ class GotOcrParser(DocumentParser):
                         # Restore default dtype
                         torch.set_default_dtype(original_dtype)
                         return result
                     except Exception as inner_e:
                         logger.error(f"Float16 fallback failed: {str(inner_e)}")

 from src.parsers.parser_interface import DocumentParser
 from src.parsers.parser_registry import ParserRegistry
+from src.utils.latex_converter import latex_to_markdown
 # Configure logging
 logger = logging.getLogger(__name__)
             **kwargs: Additional arguments to pass to the model
         Returns:
+            Extracted text from the image, converted to Markdown if formatted
         """
         # Verify dependencies are installed
         if not self._check_dependencies():
                             str(file_path),
                             ocr_type='ocr'
                         )
+                # Convert LaTeX to Markdown for better display in UI
+                if ocr_type == "format":
+                    logger.info("Converting formatted LaTeX output to Markdown")
+                    result = latex_to_markdown(result)
                 return result
             except RuntimeError as e:
                 # Check if it's a bfloat16 error
                         # Restore default dtype
                         torch.set_default_dtype(original_dtype)
+                        # Convert LaTeX to Markdown for better display in UI
+                        if ocr_type == "format":
+                            logger.info("Converting formatted LaTeX output to Markdown")
+                            result = latex_to_markdown(result)
                         return result
                     except Exception as inner_e:
                         logger.error(f"Float16 fallback failed: {str(inner_e)}")

src/ui/ui.py CHANGED Viewed

@@ -80,13 +80,13 @@ def handle_convert(file_path, parser_name, ocr_method_name, output_format, is_ca
     # Check if we should cancel before starting
     if is_cancelled:
         logger.info("Conversion cancelled before starting")
-        return "Conversion cancelled.", None, gr.update(visible=False), gr.update(visible=True), gr.update(visible=False), None
     # Validate file type for the selected parser
     is_valid, error_msg = validate_file_for_parser(file_path, parser_name)
     if not is_valid:
         logger.error(f"File validation error: {error_msg}")
-        return f"Error: {error_msg}", None, gr.update(visible=False), gr.update(visible=True), gr.update(visible=False), None
     logger.info("Starting conversion with cancellation flag cleared")
@@ -107,14 +107,14 @@ def handle_convert(file_path, parser_name, ocr_method_name, output_format, is_ca
             thread.join(timeout=0.5)
             if thread.is_alive():
                 logger.warning("Thread did not finish within timeout")
-            return "Conversion cancelled.", None, gr.update(visible=False), gr.update(visible=True), gr.update(visible=False), None
         # Sleep briefly to avoid busy waiting
         time.sleep(0.1)
     # Thread has completed, check results
     if results["error"]:
-        return f"Error: {results['error']}", None, gr.update(visible=False), gr.update(visible=True), gr.update(visible=False), None
     content = results["content"]
     download_file = results["download_file"]
@@ -122,14 +122,14 @@ def handle_convert(file_path, parser_name, ocr_method_name, output_format, is_ca
     # If conversion returned a cancellation message
     if content == "Conversion cancelled.":
         logger.info("Converter returned cancellation message")
-        return content, None, gr.update(visible=False), gr.update(visible=True), gr.update(visible=False), None
     # Format the content and wrap it in the scrollable container
     formatted_content = format_markdown_content(str(content))
     html_output = f"<div class='output-container'>{formatted_content}</div>"
     logger.info("Conversion completed successfully")
-    return html_output, download_file, gr.update(visible=False), gr.update(visible=True), gr.update(visible=False), None
 def create_ui():
     with gr.Blocks(css="""

     # Check if we should cancel before starting
     if is_cancelled:
         logger.info("Conversion cancelled before starting")
+        return "Conversion cancelled.", None, gr.update(visible=False), gr.update(visible=True), gr.update(visible=False)
     # Validate file type for the selected parser
     is_valid, error_msg = validate_file_for_parser(file_path, parser_name)
     if not is_valid:
         logger.error(f"File validation error: {error_msg}")
+        return f"Error: {error_msg}", None, gr.update(visible=False), gr.update(visible=True), gr.update(visible=False)
     logger.info("Starting conversion with cancellation flag cleared")
             thread.join(timeout=0.5)
             if thread.is_alive():
                 logger.warning("Thread did not finish within timeout")
+            return "Conversion cancelled.", None, gr.update(visible=False), gr.update(visible=True), gr.update(visible=False)
         # Sleep briefly to avoid busy waiting
         time.sleep(0.1)
     # Thread has completed, check results
     if results["error"]:
+        return f"Error: {results['error']}", None, gr.update(visible=False), gr.update(visible=True), gr.update(visible=False)
     content = results["content"]
     download_file = results["download_file"]
     # If conversion returned a cancellation message
     if content == "Conversion cancelled.":
         logger.info("Converter returned cancellation message")
+        return content, None, gr.update(visible=False), gr.update(visible=True), gr.update(visible=False)
     # Format the content and wrap it in the scrollable container
     formatted_content = format_markdown_content(str(content))
     html_output = f"<div class='output-container'>{formatted_content}</div>"
     logger.info("Conversion completed successfully")
+    return html_output, download_file, gr.update(visible=False), gr.update(visible=True), gr.update(visible=False)
 def create_ui():
     with gr.Blocks(css="""

src/utils/__init__.py ADDED Viewed

	@@ -0,0 +1,5 @@

+"""Utility functions for the Markit application."""
+from src.utils.latex_converter import latex_to_markdown
+__all__ = ['latex_to_markdown']

src/utils/latex_converter.py ADDED Viewed

	@@ -0,0 +1,186 @@

+import re
+import logging
+# Configure logging
+logger = logging.getLogger(__name__)
def latex_to_markdown(latex_text):
    """
    Turn LaTeX-formatted text (as produced by GOT-OCR) into Markdown.

    The text is passed through a fixed sequence of converters: tables,
    math environments, inline formatting commands, lists and finally a
    clean-up pass for leftover LaTeX syntax.

    Args:
        latex_text (str): LaTeX formatted text

    Returns:
        str: Markdown formatted text; an empty string when the input is
        empty or otherwise falsy.
    """
    if not latex_text:
        return ""

    logger.info("Converting LaTeX to Markdown")

    # Order matters: each stage consumes the output of the previous one.
    stages = (
        convert_latex_tables,
        convert_math_environments,
        convert_formatting_commands,
        convert_latex_lists,
        cleanup_latex,
    )
    converted = latex_text
    for stage in stages:
        converted = stage(converted)

    logger.info("LaTeX to Markdown conversion completed")
    return converted
def _tabular_rows_to_markdown(body):
    """Render the row text of a LaTeX table body as Markdown pipe-table lines."""
    md_rows = []
    # ``\\`` terminates a row and may carry a spacing argument (``\\[2pt]``).
    for raw_row in re.split(r'\\\\(?:\[[^\]]*\])?', body):
        # Rule commands carry no cell content; drop them before looking at
        # the row so a rule-only chunk is recognised as empty.
        row = re.sub(
            r'\\(?:hline|toprule|midrule|bottomrule)(?![a-zA-Z])', '', raw_row
        )
        row = re.sub(r'\\cline\{[^}]*\}', '', row)
        if not row.strip():
            continue
        # Split on unescaped ``&`` only: ``\&`` is a literal ampersand.
        # Whitespace (including newlines) inside a cell is collapsed because
        # a Markdown table row must stay on one line.
        cells = [' '.join(cell.split()) for cell in re.split(r'(?<!\\)&', row)]
        md_rows.append('| ' + ' | '.join(cells) + ' |')
        # The separator follows the first row that was actually emitted,
        # not the first chunk of the split (which may have been skipped).
        if len(md_rows) == 1:
            md_rows.append('| ' + ' | '.join(['---'] * len(cells)) + ' |')
    return '\n'.join(md_rows)


def convert_latex_tables(latex_text):
    """
    Convert LaTeX tables to Markdown tables.

    ``tabular`` environments are replaced by pipe tables whose first row is
    used as the header. The optional position argument and the column
    specification (e.g. ``{|c|c|}``) are discarded rather than leaking into
    the first cell. A surrounding ``table`` float is unwrapped so that its
    ``\\begin``/``\\end`` markers and ``\\centering`` do not remain in the
    output; a ``table`` environment that contains rows directly (no inner
    ``tabular``) is converted as if it were a ``tabular``.

    Args:
        latex_text (str): Text that may contain LaTeX tables.

    Returns:
        str: The text with every recognised table replaced by Markdown.
    """
    def unwrap_table(match):
        body = match.group(1)
        if '\\begin{tabular}' not in body:
            # No inner tabular: treat the float body itself as the rows.
            return _tabular_rows_to_markdown(body)
        # Keep the inner tabular (converted by the next pass) and any
        # caption text; only the float scaffolding is removed.
        return re.sub(r'\\centering(?![a-zA-Z])', '', body).strip()

    def replace_tabular(match):
        return _tabular_rows_to_markdown(match.group(1))

    result = re.sub(
        r'\\begin\{table\}(?:\[[^\]]*\])?(.*?)\\end\{table\}',
        unwrap_table,
        latex_text,
        flags=re.DOTALL,
    )
    result = re.sub(
        r'\\begin\{tabular\}(?:\[[^\]]*\])?'
        r'(?:\{(?:[^{}]|\{[^{}]*\})*\})?'  # column spec, e.g. {|l|p{3cm}|}
        r'(.*?)\\end\{tabular\}',
        replace_tabular,
        result,
        flags=re.DOTALL,
    )
    return result
def convert_math_environments(latex_text):
    """
    Convert LaTeX math environments to Markdown math syntax.

    * ``equation``, ``align`` and ``eqnarray`` (plain or starred) and
      ``\\[ ... \\]`` become display math ``$$ ... $$``.
    * ``\\( ... \\)`` and the ``math`` environment become inline ``$ ... $``.

    A delimiter is only recognised when its backslash is not itself escaped,
    so a line break followed by a bracket (``\\\\(`` or ``\\\\[2pt]``) is left
    untouched.

    Args:
        latex_text (str): Text that may contain LaTeX math markup.

    Returns:
        str: The text with math delimiters rewritten for Markdown.
    """
    # Matches an even-length run of backslashes (captured) that is not
    # preceded by another backslash; the backslash that follows it therefore
    # really starts a command instead of being half of a ``\\`` line break.
    unescaped = r'(?<!\\)((?:\\\\)*)'

    # Display environments; the closing name must match the opening one,
    # including the optional star.
    result = re.sub(
        r'\\begin\{(equation|align|eqnarray)(\*?)\}(.*?)\\end\{\1\2\}',
        lambda m: '$$' + m.group(3) + '$$',
        latex_text,
        flags=re.DOTALL,
    )

    # \[ ... \] display math
    result = re.sub(
        unescaped + r'\\\[(.*?)' + unescaped + r'\\\]',
        lambda m: m.group(1) + '$$' + m.group(2) + m.group(3) + '$$',
        result,
        flags=re.DOTALL,
    )

    # \( and \) inline math delimiters
    result = re.sub(
        unescaped + r'\\[()]',
        lambda m: m.group(1) + '$',
        result,
    )

    # Standalone math environment
    result = re.sub(
        r'\\begin\{math\}(.*?)\\end\{math\}',
        lambda m: '$' + m.group(1) + '$',
        result,
        flags=re.DOTALL,
    )
    return result
def convert_formatting_commands(latex_text):
    """
    Convert LaTeX formatting commands to Markdown syntax.

    Handles bold, italic/emphasis, underline, sectioning commands (plain and
    starred, e.g. ``\\section*{...}``) and ``\\title``. Arguments containing
    nested braces are not supported: each pattern stops at the first ``}``.

    Args:
        latex_text (str): Text that may contain LaTeX formatting commands.

    Returns:
        str: The text with the recognised commands rewritten as Markdown.
    """
    rules = (
        # Bold: \textbf{text} -> **text**
        (r'\\textbf\{([^}]*)\}', r'**\1**'),
        (r'\\bf\{([^}]*)\}', r'**\1**'),
        # Italic: \textit{text} -> *text*
        (r'\\textit\{([^}]*)\}', r'*\1*'),
        (r'\\it\{([^}]*)\}', r'*\1*'),
        (r'\\emph\{([^}]*)\}', r'*\1*'),
        # Underline has no direct Markdown equivalent; use emphasis instead
        (r'\\underline\{([^}]*)\}', r'_\1_'),
        # Section headings; the starred (unnumbered) forms map to the same
        # Markdown level because Markdown headings carry no numbering.
        (r'\\section\*?\{([^}]*)\}', r'## \1'),
        (r'\\subsection\*?\{([^}]*)\}', r'### \1'),
        (r'\\subsubsection\*?\{([^}]*)\}', r'#### \1'),
        # \title{text} becomes the top-level heading
        (r'\\title\{([^}]*)\}', r'# \1'),
    )
    result = latex_text
    for pattern, replacement in rules:
        result = re.sub(pattern, replacement, result)
    return result
# One ``\item`` entry: the command itself (but not longer commands such as
# ``\itemsep``), optional whitespace, then everything up to the next
# ``\item`` or the end of the list body.
_LIST_ITEM_RE = re.compile(
    r'\\item(?![a-zA-Z])\s*(.*?)(?=\\item(?![a-zA-Z])|$)',
    re.DOTALL,
)

# An ``itemize``/``enumerate`` environment that contains no further list
# environment, i.e. an innermost list.
_INNERMOST_LIST_RE = re.compile(
    r'\\begin\{(itemize|enumerate)\}'
    r'((?:(?!\\begin\{(?:itemize|enumerate)\}).)*?)'
    r'\\end\{\1\}',
    re.DOTALL,
)


def _render_list(match):
    """Render one innermost list environment as a Markdown list."""
    ordered = match.group(1) == 'enumerate'
    lines = []
    items = [text.strip() for text in _LIST_ITEM_RE.findall(match.group(2))]
    for number, text in enumerate(items, start=1):
        marker = f'{number}. ' if ordered else '- '
        first_line, *continuation = text.split('\n')
        lines.append(marker + first_line)
        # Indent continuation lines under the marker so that multi-line
        # items and already-converted nested lists stay inside this item.
        pad = ' ' * len(marker)
        lines.extend(pad + line if line.strip() else '' for line in continuation)
    return '\n' + '\n'.join(lines) + '\n'


def convert_latex_lists(latex_text):
    """
    Convert LaTeX lists to Markdown lists.

    ``itemize`` becomes a ``-`` bullet list and ``enumerate`` a numbered
    list. Lists are converted innermost-first, so nested lists end up
    indented beneath their parent item. An ``\\item`` does not need to be
    followed by whitespace to be recognised. Text that appears in a list
    body before its first ``\\item`` is discarded.

    Args:
        latex_text (str): Text that may contain LaTeX list environments.

    Returns:
        str: The text with every complete list environment replaced.
    """
    result = latex_text
    # Each pass converts the current innermost lists, exposing their parents
    # as the new innermost ones. Every substitution removes one ``\begin``,
    # so the loop terminates.
    while True:
        result, replaced = _INNERMOST_LIST_RE.subn(_render_list, result)
        if not replaced:
            return result
def cleanup_latex(latex_text):
    """
    Clean up any remaining LaTeX-specific syntax.

    Removes document scaffolding (``\\documentclass`` and ``\\usepackage``,
    with or without an optional ``[...]`` argument, ``\\begin{document}``,
    ``\\end{document}``, ``\\maketitle``), unescapes LaTeX special
    characters, turns ``~`` into a plain space and ``\\ldots`` into ``...``,
    and collapses runs of blank lines into a single blank line.

    Args:
        latex_text (str): Text that may still contain LaTeX syntax.

    Returns:
        str: The cleaned-up text.
    """
    result = latex_text

    # Remove LaTeX document structure commands
    result = re.sub(r'\\begin\{document\}|\\end\{document\}', '', result)
    result = re.sub(r'\\maketitle', '', result)
    # The optional argument (e.g. \documentclass[12pt]{article}) must be
    # consumed as well, otherwise the whole command is left in the output.
    result = re.sub(
        r'\\(?:documentclass|usepackage)(?:\[[^\]]*\])?\{[^}]*\}', '', result
    )

    # Convert special characters
    latex_special_chars = {
        r'\&': '&',
        r'\%': '%',
        r'\$': '$',
        r'\#': '#',
        r'\_': '_',
        r'\{': '{',
        r'\}': '}',
        r'~': ' ',
        r'\ldots': '...',
    }
    for latex_char, md_char in latex_special_chars.items():
        result = result.replace(latex_char, md_char)

    # Collapse three or more consecutive line breaks (possibly separated by
    # other whitespace) into one blank line
    result = re.sub(r'\n\s*\n\s*\n', '\n\n', result)
    return result