Spaces:
Runtime error
Runtime error
removing markers PDF for GOT OCR
Browse files- README.md +0 -4
- requirements.txt +0 -1
- src/parsers/__init__.py +0 -1
- src/parsers/marker_parser.py +0 -61
README.md
CHANGED
@@ -30,7 +30,6 @@ Markit is a powerful tool that converts various document formats (PDF, DOCX, ima
|
|
30 |
- **Advanced Parsing Engines**:
|
31 |
- **PyPdfium**: Fast PDF parsing using the PDFium engine
|
32 |
- **Docling**: Advanced document structure analysis
|
33 |
-
- **Marker**: Specialized for markup and formatting
|
34 |
- **Gemini Flash**: AI-powered conversion using Google's Gemini API
|
35 |
- **GOT-OCR**: State-of-the-art OCR model for images (JPG/PNG only)
|
36 |
- **OCR Integration**: Extract text from images and scanned documents using Tesseract OCR
|
@@ -121,9 +120,7 @@ build:
|
|
121 |
2. Select a parser provider:
|
122 |
- **PyPdfium**: Best for standard PDFs with selectable text
|
123 |
- **Docling**: Best for complex document layouts
|
124 |
-
- **Marker**: Best for preserving document formatting
|
125 |
- **Gemini Flash**: Best for AI-powered conversions (requires API key)
|
126 |
-
- **GOT-OCR**: Best for high-quality OCR on images (JPG/PNG only)
|
127 |
3. Choose an OCR option based on your selected parser:
|
128 |
- **None**: No OCR processing (for documents with selectable text)
|
129 |
- **Tesseract**: Basic OCR using Tesseract
|
@@ -188,7 +185,6 @@ markit/
|
|
188 |
│   │   ├── parser_interface.py # Parser interface
|
189 |
│   │   ├── parser_registry.py # Parser registry
|
190 |
│   │   ├── docling_parser.py # Docling parser
|
191 |
-
│   │   ├── marker_parser.py # Marker parser
|
192 |
│   │   └── pypdfium_parser.py # PyPDFium parser
|
193 |
│   ├── ui/ # User interface
|
194 |
│   │   ├── __init__.py # Package initialization
|
|
|
30 |
- **Advanced Parsing Engines**:
|
31 |
- **PyPdfium**: Fast PDF parsing using the PDFium engine
|
32 |
- **Docling**: Advanced document structure analysis
|
|
|
33 |
- **Gemini Flash**: AI-powered conversion using Google's Gemini API
|
34 |
- **GOT-OCR**: State-of-the-art OCR model for images (JPG/PNG only)
|
35 |
- **OCR Integration**: Extract text from images and scanned documents using Tesseract OCR
|
|
|
120 |
2. Select a parser provider:
|
121 |
- **PyPdfium**: Best for standard PDFs with selectable text
|
122 |
- **Docling**: Best for complex document layouts
|
|
|
123 |
- **Gemini Flash**: Best for AI-powered conversions (requires API key)
|
|
|
124 |
3. Choose an OCR option based on your selected parser:
|
125 |
- **None**: No OCR processing (for documents with selectable text)
|
126 |
- **Tesseract**: Basic OCR using Tesseract
|
|
|
185 |
│   │   ├── parser_interface.py # Parser interface
|
186 |
│   │   ├── parser_registry.py # Parser registry
|
187 |
│   │   ├── docling_parser.py # Docling parser
|
|
|
188 |
│   │   └── pypdfium_parser.py # PyPDFium parser
|
189 |
│   ├── ui/ # User interface
|
190 |
│   │   ├── __init__.py # Package initialization
|
requirements.txt
CHANGED
@@ -2,7 +2,6 @@ docling==2.25.0
|
|
2 |
gradio==5.14.0
|
3 |
grpcio-status==1.70.0
|
4 |
markdown==3.7
|
5 |
-
marker-pdf==1.3.5
|
6 |
multiprocess==0.70.16
|
7 |
pipdeptree==2.25.0
|
8 |
pytesseract==0.3.13
|
|
|
2 |
gradio==5.14.0
|
3 |
grpcio-status==1.70.0
|
4 |
markdown==3.7
|
|
|
5 |
multiprocess==0.70.16
|
6 |
pipdeptree==2.25.0
|
7 |
pytesseract==0.3.13
|
src/parsers/__init__.py
CHANGED
@@ -2,7 +2,6 @@
|
|
2 |
|
3 |
# Import all parsers to ensure they're registered
|
4 |
from src.parsers.docling_parser import DoclingParser
|
5 |
-
from src.parsers.marker_parser import MarkerParser
|
6 |
from src.parsers.pypdfium_parser import PyPdfiumParser
|
7 |
from src.parsers.gemini_flash_parser import GeminiFlashParser
|
8 |
from src.parsers.got_ocr_parser import GotOcrParser
|
|
|
2 |
|
3 |
# Import all parsers to ensure they're registered
|
4 |
from src.parsers.docling_parser import DoclingParser
|
|
|
5 |
from src.parsers.pypdfium_parser import PyPdfiumParser
|
6 |
from src.parsers.gemini_flash_parser import GeminiFlashParser
|
7 |
from src.parsers.got_ocr_parser import GotOcrParser
|
src/parsers/marker_parser.py
DELETED
@@ -1,61 +0,0 @@
|
|
1 |
-
from pathlib import Path
|
2 |
-
from typing import Dict, List, Optional, Any, Union
|
3 |
-
import subprocess
|
4 |
-
import tempfile
|
5 |
-
import os
|
6 |
-
import json
|
7 |
-
|
8 |
-
from src.parsers.parser_interface import DocumentParser
|
9 |
-
from src.parsers.parser_registry import ParserRegistry
|
10 |
-
from marker.converters.pdf import PdfConverter
|
11 |
-
from marker.models import create_model_dict
|
12 |
-
from marker.output import text_from_rendered
|
13 |
-
|
14 |
-
|
15 |
-
class MarkerParser(DocumentParser):
|
16 |
-
"""Parser implementation using Marker."""
|
17 |
-
|
18 |
-
@classmethod
|
19 |
-
def get_name(cls) -> str:
|
20 |
-
return "Marker"
|
21 |
-
|
22 |
-
@classmethod
|
23 |
-
def get_supported_ocr_methods(cls) -> List[Dict[str, Any]]:
|
24 |
-
return [
|
25 |
-
{
|
26 |
-
"id": "no_ocr",
|
27 |
-
"name": "No OCR",
|
28 |
-
"default_params": {}
|
29 |
-
},
|
30 |
-
{
|
31 |
-
"id": "force_ocr",
|
32 |
-
"name": "Force OCR",
|
33 |
-
"default_params": {}
|
34 |
-
}
|
35 |
-
]
|
36 |
-
|
37 |
-
def parse(self, file_path: Union[str, Path], ocr_method: Optional[str] = None, **kwargs) -> str:
|
38 |
-
"""Parse a document using Marker."""
|
39 |
-
force_ocr = ocr_method == "force_ocr"
|
40 |
-
|
41 |
-
converter = PdfConverter(
|
42 |
-
artifact_dict=create_model_dict(),
|
43 |
-
config={"force_ocr": force_ocr}
|
44 |
-
)
|
45 |
-
rendered = converter(str(file_path))
|
46 |
-
content, _, _ = text_from_rendered(rendered)
|
47 |
-
|
48 |
-
# Format the content based on the requested output format
|
49 |
-
output_format = kwargs.get("output_format", "markdown")
|
50 |
-
if output_format.lower() == "json":
|
51 |
-
return json.dumps({"content": content}, ensure_ascii=False, indent=2)
|
52 |
-
elif output_format.lower() == "text":
|
53 |
-
return content.replace("#", "").replace("*", "").replace("_", "")
|
54 |
-
elif output_format.lower() == "document_tags":
|
55 |
-
return f"<doc>\n{content}\n</doc>"
|
56 |
-
else:
|
57 |
-
return content
|
58 |
-
|
59 |
-
|
60 |
-
# Register the parser with the registry
|
61 |
-
ParserRegistry.register(MarkerParser)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|