Spaces:

Ansemin101
/

Markit_v2

Runtime error

App Files Community

AnseMin committed on Mar 17

Commit

04b3509

1 Parent(s): c337a68

removing markers PDF for GOT OCR

Browse files

Files changed (4) hide show

README.md +0 -4
requirements.txt +0 -1
src/parsers/__init__.py +0 -1
src/parsers/marker_parser.py +0 -61

README.md CHANGED Viewed

@@ -30,7 +30,6 @@ Markit is a powerful tool that converts various document formats (PDF, DOCX, ima
 - **Advanced Parsing Engines**:
   - **PyPdfium**: Fast PDF parsing using the PDFium engine
   - **Docling**: Advanced document structure analysis
-  - **Marker**: Specialized for markup and formatting
   - **Gemini Flash**: AI-powered conversion using Google's Gemini API
   - **GOT-OCR**: State-of-the-art OCR model for images (JPG/PNG only)
 - **OCR Integration**: Extract text from images and scanned documents using Tesseract OCR
@@ -121,9 +120,7 @@ build:
 2. Select a parser provider:
    - **PyPdfium**: Best for standard PDFs with selectable text
    - **Docling**: Best for complex document layouts
-   - **Marker**: Best for preserving document formatting
    - **Gemini Flash**: Best for AI-powered conversions (requires API key)
-   - **GOT-OCR**: Best for high-quality OCR on images (JPG/PNG only)
 3. Choose an OCR option based on your selected parser:
    - **None**: No OCR processing (for documents with selectable text)
    - **Tesseract**: Basic OCR using Tesseract
@@ -188,7 +185,6 @@ markit/
 │   │   ├── parser_interface.py # Parser interface
 │   │   ├── parser_registry.py # Parser registry
 │   │   ├── docling_parser.py # Docling parser
-│   │   ├── marker_parser.py # Marker parser
 │   │   └── pypdfium_parser.py # PyPDFium parser
 │   ├── ui/                 # User interface
 │   │   ├── __init__.py     # Package initialization

 - **Advanced Parsing Engines**:
   - **PyPdfium**: Fast PDF parsing using the PDFium engine
   - **Docling**: Advanced document structure analysis
   - **Gemini Flash**: AI-powered conversion using Google's Gemini API
   - **GOT-OCR**: State-of-the-art OCR model for images (JPG/PNG only)
 - **OCR Integration**: Extract text from images and scanned documents using Tesseract OCR
 2. Select a parser provider:
    - **PyPdfium**: Best for standard PDFs with selectable text
    - **Docling**: Best for complex document layouts
    - **Gemini Flash**: Best for AI-powered conversions (requires API key)
 3. Choose an OCR option based on your selected parser:
    - **None**: No OCR processing (for documents with selectable text)
    - **Tesseract**: Basic OCR using Tesseract
 │   │   ├── parser_interface.py # Parser interface
 │   │   ├── parser_registry.py # Parser registry
 │   │   ├── docling_parser.py # Docling parser
 │   │   └── pypdfium_parser.py # PyPDFium parser
 │   ├── ui/                 # User interface
 │   │   ├── __init__.py     # Package initialization

requirements.txt CHANGED Viewed

@@ -2,7 +2,6 @@ docling==2.25.0
 gradio==5.14.0
 grpcio-status==1.70.0
 markdown==3.7
-marker-pdf==1.3.5
 multiprocess==0.70.16
 pipdeptree==2.25.0
 pytesseract==0.3.13

 gradio==5.14.0
 grpcio-status==1.70.0
 markdown==3.7
 multiprocess==0.70.16
 pipdeptree==2.25.0
 pytesseract==0.3.13

src/parsers/__init__.py CHANGED Viewed

@@ -2,7 +2,6 @@
 # Import all parsers to ensure they're registered
 from src.parsers.docling_parser import DoclingParser
-from src.parsers.marker_parser import MarkerParser
 from src.parsers.pypdfium_parser import PyPdfiumParser
 from src.parsers.gemini_flash_parser import GeminiFlashParser
 from src.parsers.got_ocr_parser import GotOcrParser

 # Import all parsers to ensure they're registered
 from src.parsers.docling_parser import DoclingParser
 from src.parsers.pypdfium_parser import PyPdfiumParser
 from src.parsers.gemini_flash_parser import GeminiFlashParser
 from src.parsers.got_ocr_parser import GotOcrParser

src/parsers/marker_parser.py DELETED Viewed

@@ -1,61 +0,0 @@
-from pathlib import Path
-from typing import Dict, List, Optional, Any, Union
-import subprocess
-import tempfile
-import os
-import json
-from src.parsers.parser_interface import DocumentParser
-from src.parsers.parser_registry import ParserRegistry
-from marker.converters.pdf import PdfConverter
-from marker.models import create_model_dict
-from marker.output import text_from_rendered
-class MarkerParser(DocumentParser):
-    """Parser implementation using Marker."""
-    @classmethod
-    def get_name(cls) -> str:
-        return "Marker"
-    @classmethod
-    def get_supported_ocr_methods(cls) -> List[Dict[str, Any]]:
-        return [
-            {
-                "id": "no_ocr",
-                "name": "No OCR",
-                "default_params": {}
-            },
-            {
-                "id": "force_ocr",
-                "name": "Force OCR",
-                "default_params": {}
-            }
-        ]
-    def parse(self, file_path: Union[str, Path], ocr_method: Optional[str] = None, **kwargs) -> str:
-        """Parse a document using Marker."""
-        force_ocr = ocr_method == "force_ocr"
-        converter = PdfConverter(
-            artifact_dict=create_model_dict(),
-            config={"force_ocr": force_ocr}
-        )
-        rendered = converter(str(file_path))
-        content, _, _ = text_from_rendered(rendered)
-        # Format the content based on the requested output format
-        output_format = kwargs.get("output_format", "markdown")
-        if output_format.lower() == "json":
-            return json.dumps({"content": content}, ensure_ascii=False, indent=2)
-        elif output_format.lower() == "text":
-            return content.replace("#", "").replace("*", "").replace("_", "")
-        elif output_format.lower() == "document_tags":
-            return f"<doc>\n{content}\n</doc>"
-        else:
-            return content
-# Register the parser with the registry
-ParserRegistry.register(MarkerParser)