AnseMin committed on
Commit
b3a5734
·
1 Parent(s): 9ddb112

remove docling and pypdfium because of dependency issues

Browse files
README.md CHANGED
@@ -1,5 +1,5 @@
1
  ---
2
- title: Markit v2
3
  emoji: 📄
4
  colorFrom: blue
5
  colorTo: indigo
 
1
  ---
2
+ title: Markit GOT OCR
3
  emoji: 📄
4
  colorFrom: blue
5
  colorTo: indigo
requirements.txt CHANGED
@@ -1,27 +1,29 @@
1
- docling==2.25.0
2
  gradio==5.14.0
3
  grpcio-status==1.70.0
4
  markdown==3.7
5
  multiprocess==0.70.16
6
  pipdeptree==2.25.0
7
- pytesseract==0.3.13
8
- semchunk==2.2.2
9
  Pillow>=9.0.0,<11.0.0
10
  numpy>=1.21.0
11
- # Tesseract dependencies
 
 
 
 
 
12
  tesseract==0.1.3
13
  tesserocr>=2.5.0; platform_system != "Windows" # Only install on non-Windows systems
14
- # Additional dependencies for image processing
15
  opencv-python-headless>=4.5.0 # Headless version for server environments
16
- pdf2image>=1.16.0 # For PDF processing
 
17
  dill==0.3.8 # Downgraded to be compatible with datasets
18
- # Gemini API client
19
- google-genai>=0.1.0
20
- # Environment variables
21
  python-dotenv>=1.0.0
22
- # Pin pydantic to resolve compatibility issues with gradio
23
  pydantic==2.7.1
24
 
 
 
 
25
  # GOT-OCR dependencies
26
  torch>=2.0.1
27
  torchvision>=0.15.2
 
1
+ # Core dependencies
2
  gradio==5.14.0
3
  grpcio-status==1.70.0
4
  markdown==3.7
5
  multiprocess==0.70.16
6
  pipdeptree==2.25.0
 
 
7
  Pillow>=9.0.0,<11.0.0
8
  numpy>=1.21.0
9
+
10
+ # PDF processing
11
+ pdf2image>=1.16.0
12
+
13
+ # OCR dependencies (for GOT-OCR)
14
+ pytesseract==0.3.13
15
  tesseract==0.1.3
16
  tesserocr>=2.5.0; platform_system != "Windows" # Only install on non-Windows systems
 
17
  opencv-python-headless>=4.5.0 # Headless version for server environments
18
+
19
+ # Utility dependencies
20
  dill==0.3.8 # Downgraded to be compatible with datasets
 
 
 
21
  python-dotenv>=1.0.0
 
22
  pydantic==2.7.1
23
 
24
+ # Gemini API client
25
+ google-genai>=0.1.0
26
+
27
  # GOT-OCR dependencies
28
  torch>=2.0.1
29
  torchvision>=0.15.2
src/core/converter.py CHANGED
@@ -8,7 +8,7 @@ from pathlib import Path
8
  from src.core.parser_factory import ParserFactory
9
 
10
  # Import all parsers to ensure they're registered
11
- import parsers
12
 
13
  # Reference to the cancellation flag from ui.py
14
  # This will be set by the UI when the cancel button is clicked
 
8
  from src.core.parser_factory import ParserFactory
9
 
10
  # Import all parsers to ensure they're registered
11
+ from src import parsers
12
 
13
  # Reference to the cancellation flag from ui.py
14
  # This will be set by the UI when the cancel button is clicked
src/parsers/__init__.py CHANGED
@@ -1,8 +1,6 @@
1
  """Parser implementations for document conversion."""
2
 
3
  # Import all parsers to ensure they're registered
4
- from src.parsers.docling_parser import DoclingParser
5
- from src.parsers.pypdfium_parser import PyPdfiumParser
6
  from src.parsers.gemini_flash_parser import GeminiFlashParser
7
  from src.parsers.got_ocr_parser import GotOcrParser
8
 
 
1
  """Parser implementations for document conversion."""
2
 
3
  # Import all parsers to ensure they're registered
 
 
4
  from src.parsers.gemini_flash_parser import GeminiFlashParser
5
  from src.parsers.got_ocr_parser import GotOcrParser
6
 
src/parsers/docling_parser.py DELETED
@@ -1,161 +0,0 @@
1
- from pathlib import Path
2
- from typing import Dict, List, Optional, Any, Union
3
- import json
4
- import os
5
- import shutil
6
-
7
- from src.parsers.parser_interface import DocumentParser
8
- from src.parsers.parser_registry import ParserRegistry
9
- from docling.document_converter import DocumentConverter, PdfFormatOption
10
- from docling.datamodel.base_models import InputFormat
11
- from docling.datamodel.pipeline_options import (
12
- AcceleratorDevice,
13
- AcceleratorOptions,
14
- PdfPipelineOptions,
15
- )
16
- from docling.models.tesseract_ocr_model import TesseractOcrOptions
17
- from docling.models.tesseract_ocr_cli_model import TesseractCliOcrOptions
18
-
19
-
20
- class DoclingParser(DocumentParser):
21
- """Parser implementation using Docling."""
22
-
23
- @classmethod
24
- def get_name(cls) -> str:
25
- return "Docling"
26
-
27
- @classmethod
28
- def get_supported_ocr_methods(cls) -> List[Dict[str, Any]]:
29
- return [
30
- {
31
- "id": "no_ocr",
32
- "name": "No OCR",
33
- "default_params": {}
34
- },
35
- {
36
- "id": "easyocr",
37
- "name": "EasyOCR",
38
- "default_params": {"languages": ["en"]}
39
- },
40
- {
41
- "id": "easyocr_cpu",
42
- "name": "EasyOCR (CPU only)",
43
- "default_params": {"languages": ["en"], "use_gpu": False}
44
- },
45
- {
46
- "id": "tesseract",
47
- "name": "Tesseract",
48
- "default_params": {}
49
- },
50
- {
51
- "id": "tesseract_cli",
52
- "name": "Tesseract CLI",
53
- "default_params": {}
54
- },
55
- {
56
- "id": "full_force_ocr",
57
- "name": "Full Force OCR",
58
- "default_params": {}
59
- }
60
- ]
61
-
62
- def parse(self, file_path: Union[str, Path], ocr_method: Optional[str] = None, **kwargs) -> str:
63
- """Parse a document using Docling."""
64
- # Special case for full force OCR
65
- if ocr_method == "full_force_ocr":
66
- return self._apply_full_force_ocr(file_path)
67
-
68
- # Regular Docling parsing
69
- pipeline_options = PdfPipelineOptions()
70
- pipeline_options.do_table_structure = True
71
- pipeline_options.table_structure_options.do_cell_matching = True
72
-
73
- # Configure OCR based on the method
74
- if ocr_method == "no_ocr":
75
- pipeline_options.do_ocr = False
76
- elif ocr_method == "easyocr":
77
- pipeline_options.do_ocr = True
78
- pipeline_options.ocr_options.lang = kwargs.get("languages", ["en"])
79
- pipeline_options.accelerator_options = AcceleratorOptions(
80
- num_threads=4, device=AcceleratorDevice.AUTO
81
- )
82
- elif ocr_method == "easyocr_cpu":
83
- pipeline_options.do_ocr = True
84
- pipeline_options.ocr_options.lang = kwargs.get("languages", ["en"])
85
- pipeline_options.ocr_options.use_gpu = False
86
- elif ocr_method == "tesseract":
87
- pipeline_options.do_ocr = True
88
- pipeline_options.ocr_options = TesseractOcrOptions()
89
- elif ocr_method == "tesseract_cli":
90
- pipeline_options.do_ocr = True
91
- pipeline_options.ocr_options = TesseractCliOcrOptions()
92
-
93
- # Create the converter
94
- converter = DocumentConverter(
95
- format_options={
96
- InputFormat.PDF: PdfFormatOption(
97
- pipeline_options=pipeline_options
98
- )
99
- }
100
- )
101
-
102
- # Convert the document
103
- result = converter.convert(Path(file_path))
104
- doc = result.document
105
-
106
- # Return the content in the requested format
107
- output_format = kwargs.get("output_format", "markdown")
108
- if output_format.lower() == "json":
109
- return json.dumps(doc.export_to_dict(), ensure_ascii=False, indent=2)
110
- elif output_format.lower() == "text":
111
- return doc.export_to_text()
112
- elif output_format.lower() == "document_tags":
113
- return doc.export_to_document_tokens()
114
- else:
115
- return doc.export_to_markdown()
116
-
117
- def _apply_full_force_ocr(self, file_path: Union[str, Path]) -> str:
118
- """Apply full force OCR to a document."""
119
- input_doc = Path(file_path)
120
- file_extension = input_doc.suffix.lower()
121
-
122
- # Debug information
123
- print(f"Applying full force OCR to file: {input_doc} (type: {file_extension})")
124
-
125
- # Basic pipeline setup
126
- pipeline_options = PdfPipelineOptions()
127
- pipeline_options.do_ocr = True
128
- pipeline_options.do_table_structure = True
129
- pipeline_options.table_structure_options.do_cell_matching = True
130
-
131
- # Find tesseract executable
132
- tesseract_path = shutil.which("tesseract") or "/usr/bin/tesseract"
133
- print(f"Using tesseract at: {tesseract_path}")
134
-
135
- # Configure OCR options
136
- ocr_options = TesseractCliOcrOptions(force_full_page_ocr=True) # Using standard options instead of CLI
137
- pipeline_options.ocr_options = ocr_options
138
-
139
- # Set up format options based on file type
140
- format_options = {
141
- InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
142
- }
143
-
144
- # Handle image files
145
- if file_extension in ['.jpg', '.jpeg', '.png', '.tiff', '.tif', '.bmp']:
146
- print(f"Processing as image file: {file_extension}")
147
- format_options[InputFormat.IMAGE] = PdfFormatOption(pipeline_options=pipeline_options)
148
-
149
- # Try full force OCR with standard options
150
- try:
151
- converter = DocumentConverter(format_options=format_options)
152
- result = converter.convert(input_doc)
153
- return result.document.export_to_markdown()
154
- except Exception as e:
155
- print(f"Error with standard OCR: {e}")
156
- print(f"Attempting fallback to tesseract_cli OCR...")
157
- return self.parse(file_path, ocr_method="tesseract_cli")
158
-
159
-
160
- # Register the parser with the registry
161
- ParserRegistry.register(DoclingParser)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/parsers/pypdfium_parser.py DELETED
@@ -1,78 +0,0 @@
1
- from pathlib import Path
2
- from typing import Dict, List, Optional, Any, Union
3
- import json
4
- import pypdfium2 as pdfium
5
-
6
- from src.parsers.parser_interface import DocumentParser
7
- from src.parsers.parser_registry import ParserRegistry
8
- from docling.document_converter import DocumentConverter, PdfFormatOption
9
- from docling.datamodel.base_models import InputFormat
10
- from docling.datamodel.pipeline_options import PdfPipelineOptions
11
- from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
12
-
13
-
14
- class PyPdfiumParser(DocumentParser):
15
- """Parser implementation using PyPdfium."""
16
-
17
- @classmethod
18
- def get_name(cls) -> str:
19
- return "PyPdfium"
20
-
21
- @classmethod
22
- def get_supported_ocr_methods(cls) -> List[Dict[str, Any]]:
23
- return [
24
- {
25
- "id": "no_ocr",
26
- "name": "No OCR",
27
- "default_params": {}
28
- },
29
- {
30
- "id": "easyocr",
31
- "name": "EasyOCR",
32
- "default_params": {"languages": ["en"]}
33
- }
34
- ]
35
-
36
- def parse(self, file_path: Union[str, Path], ocr_method: Optional[str] = None, **kwargs) -> str:
37
- """Parse a document using PyPdfium."""
38
- pipeline_options = PdfPipelineOptions()
39
- pipeline_options.do_table_structure = True
40
- pipeline_options.table_structure_options.do_cell_matching = True
41
-
42
- # Configure OCR based on the method
43
- if ocr_method == "easyocr":
44
- pipeline_options.do_ocr = True
45
- # Apply any custom parameters from kwargs
46
- if "languages" in kwargs:
47
- pipeline_options.ocr_options.lang = kwargs["languages"]
48
- else:
49
- pipeline_options.do_ocr = False
50
-
51
- # Create the converter
52
- converter = DocumentConverter(
53
- format_options={
54
- InputFormat.PDF: PdfFormatOption(
55
- pipeline_options=pipeline_options,
56
- backend=PyPdfiumDocumentBackend
57
- )
58
- }
59
- )
60
-
61
- # Convert the document
62
- result = converter.convert(Path(file_path))
63
- doc = result.document
64
-
65
- # Return the content in the requested format
66
- output_format = kwargs.get("output_format", "markdown")
67
- if output_format.lower() == "json":
68
- return json.dumps(doc.export_to_dict(), ensure_ascii=False, indent=2)
69
- elif output_format.lower() == "text":
70
- return doc.export_to_text()
71
- elif output_format.lower() == "document_tags":
72
- return doc.export_to_document_tokens()
73
- else:
74
- return doc.export_to_markdown()
75
-
76
-
77
- # Register the parser with the registry
78
- ParserRegistry.register(PyPdfiumParser)