Spaces:

Ansemin101
/

Markit_v2

Running on Zero

App Files Community

AnseMin committed on Mar 17

Commit

b3a5734

1 Parent(s): 9ddb112

remove docling and pypdfium because of dependency issues

Browse files

Files changed (6) hide show

README.md +1 -1
requirements.txt +12 -10
src/core/converter.py +1 -1
src/parsers/__init__.py +0 -2
src/parsers/docling_parser.py +0 -161
src/parsers/pypdfium_parser.py +0 -78

README.md CHANGED Viewed

@@ -1,5 +1,5 @@
 ---
-title: Markit v2
 emoji: 📄
 colorFrom: blue
 colorTo: indigo

 ---
+title: Markit GOT OCR
 emoji: 📄
 colorFrom: blue
 colorTo: indigo

requirements.txt CHANGED Viewed

@@ -1,27 +1,29 @@
-docling==2.25.0
 gradio==5.14.0
 grpcio-status==1.70.0
 markdown==3.7
 multiprocess==0.70.16
 pipdeptree==2.25.0
-pytesseract==0.3.13
-semchunk==2.2.2
 Pillow>=9.0.0,<11.0.0
 numpy>=1.21.0
-# Tesseract dependencies
 tesseract==0.1.3
 tesserocr>=2.5.0; platform_system != "Windows"  # Only install on non-Windows systems
-# Additional dependencies for image processing
 opencv-python-headless>=4.5.0  # Headless version for server environments
-pdf2image>=1.16.0  # For PDF processing
 dill==0.3.8  # Downgraded to be compatible with datasets
-# Gemini API client
-google-genai>=0.1.0
-# Environment variables
 python-dotenv>=1.0.0
-# Pin pydantic to resolve compatibility issues with gradio
 pydantic==2.7.1
 # GOT-OCR dependencies
 torch>=2.0.1
 torchvision>=0.15.2

+# Core dependencies
 gradio==5.14.0
 grpcio-status==1.70.0
 markdown==3.7
 multiprocess==0.70.16
 pipdeptree==2.25.0
 Pillow>=9.0.0,<11.0.0
 numpy>=1.21.0
+# PDF processing
+pdf2image>=1.16.0
+# OCR dependencies (for GOT-OCR)
+pytesseract==0.3.13
 tesseract==0.1.3
 tesserocr>=2.5.0; platform_system != "Windows"  # Only install on non-Windows systems
 opencv-python-headless>=4.5.0  # Headless version for server environments
+# Utility dependencies
 dill==0.3.8  # Downgraded to be compatible with datasets
 python-dotenv>=1.0.0
 pydantic==2.7.1
+# Gemini API client
+google-genai>=0.1.0
 # GOT-OCR dependencies
 torch>=2.0.1
 torchvision>=0.15.2

src/core/converter.py CHANGED Viewed

@@ -8,7 +8,7 @@ from pathlib import Path
 from src.core.parser_factory import ParserFactory
 # Import all parsers to ensure they're registered
-import parsers
 # Reference to the cancellation flag from ui.py
 # This will be set by the UI when the cancel button is clicked

 from src.core.parser_factory import ParserFactory
 # Import all parsers to ensure they're registered
+from src import parsers
 # Reference to the cancellation flag from ui.py
 # This will be set by the UI when the cancel button is clicked

src/parsers/__init__.py CHANGED Viewed

@@ -1,8 +1,6 @@
 """Parser implementations for document conversion."""
 # Import all parsers to ensure they're registered
-from src.parsers.docling_parser import DoclingParser
-from src.parsers.pypdfium_parser import PyPdfiumParser
 from src.parsers.gemini_flash_parser import GeminiFlashParser
 from src.parsers.got_ocr_parser import GotOcrParser

 """Parser implementations for document conversion."""
 # Import all parsers to ensure they're registered
 from src.parsers.gemini_flash_parser import GeminiFlashParser
 from src.parsers.got_ocr_parser import GotOcrParser

src/parsers/docling_parser.py DELETED Viewed

@@ -1,161 +0,0 @@
-from pathlib import Path
-from typing import Dict, List, Optional, Any, Union
-import json
-import os
-import shutil
-from src.parsers.parser_interface import DocumentParser
-from src.parsers.parser_registry import ParserRegistry
-from docling.document_converter import DocumentConverter, PdfFormatOption
-from docling.datamodel.base_models import InputFormat
-from docling.datamodel.pipeline_options import (
-    AcceleratorDevice,
-    AcceleratorOptions,
-    PdfPipelineOptions,
-)
-from docling.models.tesseract_ocr_model import TesseractOcrOptions
-from docling.models.tesseract_ocr_cli_model import TesseractCliOcrOptions
-class DoclingParser(DocumentParser):
-    """Parser implementation using Docling."""
-    @classmethod
-    def get_name(cls) -> str:
-        return "Docling"
-    @classmethod
-    def get_supported_ocr_methods(cls) -> List[Dict[str, Any]]:
-        return [
-            {
-                "id": "no_ocr",
-                "name": "No OCR",
-                "default_params": {}
-            },
-            {
-                "id": "easyocr",
-                "name": "EasyOCR",
-                "default_params": {"languages": ["en"]}
-            },
-            {
-                "id": "easyocr_cpu",
-                "name": "EasyOCR (CPU only)",
-                "default_params": {"languages": ["en"], "use_gpu": False}
-            },
-            {
-                "id": "tesseract",
-                "name": "Tesseract",
-                "default_params": {}
-            },
-            {
-                "id": "tesseract_cli",
-                "name": "Tesseract CLI",
-                "default_params": {}
-            },
-            {
-                "id": "full_force_ocr",
-                "name": "Full Force OCR",
-                "default_params": {}
-            }
-        ]
-    def parse(self, file_path: Union[str, Path], ocr_method: Optional[str] = None, **kwargs) -> str:
-        """Parse a document using Docling."""
-        # Special case for full force OCR
-        if ocr_method == "full_force_ocr":
-            return self._apply_full_force_ocr(file_path)
-        # Regular Docling parsing
-        pipeline_options = PdfPipelineOptions()
-        pipeline_options.do_table_structure = True
-        pipeline_options.table_structure_options.do_cell_matching = True
-        # Configure OCR based on the method
-        if ocr_method == "no_ocr":
-            pipeline_options.do_ocr = False
-        elif ocr_method == "easyocr":
-            pipeline_options.do_ocr = True
-            pipeline_options.ocr_options.lang = kwargs.get("languages", ["en"])
-            pipeline_options.accelerator_options = AcceleratorOptions(
-                num_threads=4, device=AcceleratorDevice.AUTO
-            )
-        elif ocr_method == "easyocr_cpu":
-            pipeline_options.do_ocr = True
-            pipeline_options.ocr_options.lang = kwargs.get("languages", ["en"])
-            pipeline_options.ocr_options.use_gpu = False
-        elif ocr_method == "tesseract":
-            pipeline_options.do_ocr = True
-            pipeline_options.ocr_options = TesseractOcrOptions()
-        elif ocr_method == "tesseract_cli":
-            pipeline_options.do_ocr = True
-            pipeline_options.ocr_options = TesseractCliOcrOptions()
-        # Create the converter
-        converter = DocumentConverter(
-            format_options={
-                InputFormat.PDF: PdfFormatOption(
-                    pipeline_options=pipeline_options
-                )
-            }
-        )
-        # Convert the document
-        result = converter.convert(Path(file_path))
-        doc = result.document
-        # Return the content in the requested format
-        output_format = kwargs.get("output_format", "markdown")
-        if output_format.lower() == "json":
-            return json.dumps(doc.export_to_dict(), ensure_ascii=False, indent=2)
-        elif output_format.lower() == "text":
-            return doc.export_to_text()
-        elif output_format.lower() == "document_tags":
-            return doc.export_to_document_tokens()
-        else:
-            return doc.export_to_markdown()
-    def _apply_full_force_ocr(self, file_path: Union[str, Path]) -> str:
-        """Apply full force OCR to a document."""
-        input_doc = Path(file_path)
-        file_extension = input_doc.suffix.lower()
-        # Debug information
-        print(f"Applying full force OCR to file: {input_doc} (type: {file_extension})")
-        # Basic pipeline setup
-        pipeline_options = PdfPipelineOptions()
-        pipeline_options.do_ocr = True
-        pipeline_options.do_table_structure = True
-        pipeline_options.table_structure_options.do_cell_matching = True
-        # Find tesseract executable
-        tesseract_path = shutil.which("tesseract") or "/usr/bin/tesseract"
-        print(f"Using tesseract at: {tesseract_path}")
-        # Configure OCR options
-        ocr_options = TesseractCliOcrOptions(force_full_page_ocr=True)  # Using standard options instead of CLI
-        pipeline_options.ocr_options = ocr_options
-        # Set up format options based on file type
-        format_options = {
-            InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
-        }
-        # Handle image files
-        if file_extension in ['.jpg', '.jpeg', '.png', '.tiff', '.tif', '.bmp']:
-            print(f"Processing as image file: {file_extension}")
-            format_options[InputFormat.IMAGE] = PdfFormatOption(pipeline_options=pipeline_options)
-        # Try full force OCR with standard options
-        try:
-            converter = DocumentConverter(format_options=format_options)
-            result = converter.convert(input_doc)
-            return result.document.export_to_markdown()
-        except Exception as e:
-            print(f"Error with standard OCR: {e}")
-            print(f"Attempting fallback to tesseract_cli OCR...")
-            return self.parse(file_path, ocr_method="tesseract_cli")
-# Register the parser with the registry
-ParserRegistry.register(DoclingParser)

src/parsers/pypdfium_parser.py DELETED Viewed

@@ -1,78 +0,0 @@
-from pathlib import Path
-from typing import Dict, List, Optional, Any, Union
-import json
-import pypdfium2 as pdfium
-from src.parsers.parser_interface import DocumentParser
-from src.parsers.parser_registry import ParserRegistry
-from docling.document_converter import DocumentConverter, PdfFormatOption
-from docling.datamodel.base_models import InputFormat
-from docling.datamodel.pipeline_options import PdfPipelineOptions
-from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
-class PyPdfiumParser(DocumentParser):
-    """Parser implementation using PyPdfium."""
-    @classmethod
-    def get_name(cls) -> str:
-        return "PyPdfium"
-    @classmethod
-    def get_supported_ocr_methods(cls) -> List[Dict[str, Any]]:
-        return [
-            {
-                "id": "no_ocr",
-                "name": "No OCR",
-                "default_params": {}
-            },
-            {
-                "id": "easyocr",
-                "name": "EasyOCR",
-                "default_params": {"languages": ["en"]}
-            }
-        ]
-    def parse(self, file_path: Union[str, Path], ocr_method: Optional[str] = None, **kwargs) -> str:
-        """Parse a document using PyPdfium."""
-        pipeline_options = PdfPipelineOptions()
-        pipeline_options.do_table_structure = True
-        pipeline_options.table_structure_options.do_cell_matching = True
-        # Configure OCR based on the method
-        if ocr_method == "easyocr":
-            pipeline_options.do_ocr = True
-            # Apply any custom parameters from kwargs
-            if "languages" in kwargs:
-                pipeline_options.ocr_options.lang = kwargs["languages"]
-        else:
-            pipeline_options.do_ocr = False
-        # Create the converter
-        converter = DocumentConverter(
-            format_options={
-                InputFormat.PDF: PdfFormatOption(
-                    pipeline_options=pipeline_options,
-                    backend=PyPdfiumDocumentBackend
-                )
-            }
-        )
-        # Convert the document
-        result = converter.convert(Path(file_path))
-        doc = result.document
-        # Return the content in the requested format
-        output_format = kwargs.get("output_format", "markdown")
-        if output_format.lower() == "json":
-            return json.dumps(doc.export_to_dict(), ensure_ascii=False, indent=2)
-        elif output_format.lower() == "text":
-            return doc.export_to_text()
-        elif output_format.lower() == "document_tags":
-            return doc.export_to_document_tokens()
-        else:
-            return doc.export_to_markdown()
-# Register the parser with the registry
-ParserRegistry.register(PyPdfiumParser)