Spaces:
Sleeping
Sleeping
Clean up the full-force OCR path: remove Tesseract executable path probing and the fallback retry with regular tesseract_cli OCR
Browse files
src/parsers/docling_parser.py
CHANGED
@@ -137,47 +137,21 @@ class DoclingParser(DocumentParser):
|
|
137 |
pipeline_options.do_table_structure = True
|
138 |
pipeline_options.table_structure_options.do_cell_matching = True
|
139 |
|
140 |
-
#
|
141 |
-
|
142 |
-
tesseract_paths = [
|
143 |
-
"tesseract", # Default PATH
|
144 |
-
"/usr/bin/tesseract", # Common Linux location
|
145 |
-
"/app/tesseract/tesseract", # Possible custom location in Hugging Face
|
146 |
-
"/opt/conda/bin/tesseract", # Possible Conda env in Hugging Face
|
147 |
-
r"C:\Program Files\Tesseract-OCR\tesseract.exe" # Windows location
|
148 |
-
]
|
149 |
-
|
150 |
-
for path in tesseract_paths:
|
151 |
-
if shutil.which(path) or (os.path.isfile(path) and os.access(path, os.X_OK)):
|
152 |
-
tesseract_cmd = path
|
153 |
-
print(f"Found tesseract at: {tesseract_cmd}")
|
154 |
-
break
|
155 |
-
|
156 |
-
if not tesseract_cmd:
|
157 |
-
print("Warning: Tesseract executable not found. Using default configuration.")
|
158 |
-
tesseract_cmd = "tesseract" # Use default as fallback
|
159 |
-
|
160 |
-
# Configure OCR options with explicit tesseract path
|
161 |
-
ocr_options = TesseractCliOcrOptions(
|
162 |
-
force_full_page_ocr=True,
|
163 |
-
tesseract_cmd=tesseract_cmd
|
164 |
-
)
|
165 |
pipeline_options.ocr_options = ocr_options
|
166 |
|
167 |
# Set up format options for both PDF and image formats
|
168 |
format_options = {}
|
169 |
|
170 |
-
#
|
171 |
format_options[InputFormat.PDF] = PdfFormatOption(
|
172 |
pipeline_options=pipeline_options,
|
173 |
)
|
174 |
|
175 |
-
#
|
176 |
if file_extension in ['.jpg', '.jpeg', '.png', '.tiff', '.tif', '.bmp']:
|
177 |
-
# For image files, we'll use the same pipeline options
|
178 |
-
# but we need to specify the input format as IMAGE
|
179 |
print(f"Processing as image file: {file_extension}")
|
180 |
-
# Note: InputFormat.IMAGE is used for image files in Docling
|
181 |
format_options[InputFormat.IMAGE] = PdfFormatOption(
|
182 |
pipeline_options=pipeline_options,
|
183 |
)
|
@@ -191,17 +165,9 @@ class DoclingParser(DocumentParser):
|
|
191 |
doc = result.document
|
192 |
return doc.export_to_markdown()
|
193 |
except Exception as e:
|
194 |
-
|
195 |
-
print(f"Error during full force OCR: {e}")
|
196 |
print(f"File type: {file_extension}, File exists: {input_doc.exists()}")
|
197 |
-
|
198 |
-
# Try fallback to regular OCR if full force fails
|
199 |
-
try:
|
200 |
-
print("Attempting fallback to regular tesseract_cli OCR...")
|
201 |
-
return self.parse(file_path, ocr_method="tesseract_cli")
|
202 |
-
except Exception as fallback_error:
|
203 |
-
print(f"Fallback OCR also failed: {fallback_error}")
|
204 |
-
return f"OCR failed for {input_doc}. Error: {str(e)}"
|
205 |
|
206 |
|
207 |
# Register the parser with the registry
|
|
|
137 |
pipeline_options.do_table_structure = True
|
138 |
pipeline_options.table_structure_options.do_cell_matching = True
|
139 |
|
140 |
+
# Configure OCR options - using TesseractCliOcrOptions directly without the text column issue
|
141 |
+
ocr_options = TesseractCliOcrOptions(force_full_page_ocr=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
142 |
pipeline_options.ocr_options = ocr_options
|
143 |
|
144 |
# Set up format options for both PDF and image formats
|
145 |
format_options = {}
|
146 |
|
147 |
+
# PDF format option
|
148 |
format_options[InputFormat.PDF] = PdfFormatOption(
|
149 |
pipeline_options=pipeline_options,
|
150 |
)
|
151 |
|
152 |
+
# Handle image files
|
153 |
if file_extension in ['.jpg', '.jpeg', '.png', '.tiff', '.tif', '.bmp']:
|
|
|
|
|
154 |
print(f"Processing as image file: {file_extension}")
|
|
|
155 |
format_options[InputFormat.IMAGE] = PdfFormatOption(
|
156 |
pipeline_options=pipeline_options,
|
157 |
)
|
|
|
165 |
doc = result.document
|
166 |
return doc.export_to_markdown()
|
167 |
except Exception as e:
|
168 |
+
print(f"Error during OCR processing: {e}")
|
|
|
169 |
print(f"File type: {file_extension}, File exists: {input_doc.exists()}")
|
170 |
+
return f"OCR failed for {input_doc}. Error: {str(e)}"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
171 |
|
172 |
|
173 |
# Register the parser with the registry
|