AnseMin committed on
Commit
04b3509
·
1 Parent(s): c337a68

removing markers PDF for GOT OCR

Browse files
README.md CHANGED
@@ -30,7 +30,6 @@ Markit is a powerful tool that converts various document formats (PDF, DOCX, ima
30
  - **Advanced Parsing Engines**:
31
  - **PyPdfium**: Fast PDF parsing using the PDFium engine
32
  - **Docling**: Advanced document structure analysis
33
- - **Marker**: Specialized for markup and formatting
34
  - **Gemini Flash**: AI-powered conversion using Google's Gemini API
35
  - **GOT-OCR**: State-of-the-art OCR model for images (JPG/PNG only)
36
  - **OCR Integration**: Extract text from images and scanned documents using Tesseract OCR
@@ -121,9 +120,7 @@ build:
121
  2. Select a parser provider:
122
  - **PyPdfium**: Best for standard PDFs with selectable text
123
  - **Docling**: Best for complex document layouts
124
- - **Marker**: Best for preserving document formatting
125
  - **Gemini Flash**: Best for AI-powered conversions (requires API key)
126
- - **GOT-OCR**: Best for high-quality OCR on images (JPG/PNG only)
127
  3. Choose an OCR option based on your selected parser:
128
  - **None**: No OCR processing (for documents with selectable text)
129
  - **Tesseract**: Basic OCR using Tesseract
@@ -188,7 +185,6 @@ markit/
188
  │ │ ├── parser_interface.py # Parser interface
189
  │ │ ├── parser_registry.py # Parser registry
190
  │ │ ├── docling_parser.py # Docling parser
191
- │ │ ├── marker_parser.py # Marker parser
192
  │ │ └── pypdfium_parser.py # PyPDFium parser
193
  │ ├── ui/ # User interface
194
  │ │ ├── __init__.py # Package initialization
 
30
  - **Advanced Parsing Engines**:
31
  - **PyPdfium**: Fast PDF parsing using the PDFium engine
32
  - **Docling**: Advanced document structure analysis
 
33
  - **Gemini Flash**: AI-powered conversion using Google's Gemini API
34
  - **GOT-OCR**: State-of-the-art OCR model for images (JPG/PNG only)
35
  - **OCR Integration**: Extract text from images and scanned documents using Tesseract OCR
 
120
  2. Select a parser provider:
121
  - **PyPdfium**: Best for standard PDFs with selectable text
122
  - **Docling**: Best for complex document layouts
 
123
  - **Gemini Flash**: Best for AI-powered conversions (requires API key)
 
124
  3. Choose an OCR option based on your selected parser:
125
  - **None**: No OCR processing (for documents with selectable text)
126
  - **Tesseract**: Basic OCR using Tesseract
 
185
  │ │ ├── parser_interface.py # Parser interface
186
  │ │ ├── parser_registry.py # Parser registry
187
  │ │ ├── docling_parser.py # Docling parser
 
188
  │ │ └── pypdfium_parser.py # PyPDFium parser
189
  │ ├── ui/ # User interface
190
  │ │ ├── __init__.py # Package initialization
requirements.txt CHANGED
@@ -2,7 +2,6 @@ docling==2.25.0
2
  gradio==5.14.0
3
  grpcio-status==1.70.0
4
  markdown==3.7
5
- marker-pdf==1.3.5
6
  multiprocess==0.70.16
7
  pipdeptree==2.25.0
8
  pytesseract==0.3.13
 
2
  gradio==5.14.0
3
  grpcio-status==1.70.0
4
  markdown==3.7
 
5
  multiprocess==0.70.16
6
  pipdeptree==2.25.0
7
  pytesseract==0.3.13
src/parsers/__init__.py CHANGED
@@ -2,7 +2,6 @@
2
 
3
  # Import all parsers to ensure they're registered
4
  from src.parsers.docling_parser import DoclingParser
5
- from src.parsers.marker_parser import MarkerParser
6
  from src.parsers.pypdfium_parser import PyPdfiumParser
7
  from src.parsers.gemini_flash_parser import GeminiFlashParser
8
  from src.parsers.got_ocr_parser import GotOcrParser
 
2
 
3
  # Import all parsers to ensure they're registered
4
  from src.parsers.docling_parser import DoclingParser
 
5
  from src.parsers.pypdfium_parser import PyPdfiumParser
6
  from src.parsers.gemini_flash_parser import GeminiFlashParser
7
  from src.parsers.got_ocr_parser import GotOcrParser
src/parsers/marker_parser.py DELETED
@@ -1,61 +0,0 @@
1
- from pathlib import Path
2
- from typing import Dict, List, Optional, Any, Union
3
- import subprocess
4
- import tempfile
5
- import os
6
- import json
7
-
8
- from src.parsers.parser_interface import DocumentParser
9
- from src.parsers.parser_registry import ParserRegistry
10
- from marker.converters.pdf import PdfConverter
11
- from marker.models import create_model_dict
12
- from marker.output import text_from_rendered
13
-
14
-
15
- class MarkerParser(DocumentParser):
16
- """Parser implementation using Marker."""
17
-
18
- @classmethod
19
- def get_name(cls) -> str:
20
- return "Marker"
21
-
22
- @classmethod
23
- def get_supported_ocr_methods(cls) -> List[Dict[str, Any]]:
24
- return [
25
- {
26
- "id": "no_ocr",
27
- "name": "No OCR",
28
- "default_params": {}
29
- },
30
- {
31
- "id": "force_ocr",
32
- "name": "Force OCR",
33
- "default_params": {}
34
- }
35
- ]
36
-
37
- def parse(self, file_path: Union[str, Path], ocr_method: Optional[str] = None, **kwargs) -> str:
38
- """Parse a document using Marker."""
39
- force_ocr = ocr_method == "force_ocr"
40
-
41
- converter = PdfConverter(
42
- artifact_dict=create_model_dict(),
43
- config={"force_ocr": force_ocr}
44
- )
45
- rendered = converter(str(file_path))
46
- content, _, _ = text_from_rendered(rendered)
47
-
48
- # Format the content based on the requested output format
49
- output_format = kwargs.get("output_format", "markdown")
50
- if output_format.lower() == "json":
51
- return json.dumps({"content": content}, ensure_ascii=False, indent=2)
52
- elif output_format.lower() == "text":
53
- return content.replace("#", "").replace("*", "").replace("_", "")
54
- elif output_format.lower() == "document_tags":
55
- return f"<doc>\n{content}\n</doc>"
56
- else:
57
- return content
58
-
59
-
60
- # Register the parser with the registry
61
- ParserRegistry.register(MarkerParser)