AnseMin committed on
Commit
99c8f7d
·
1 Parent(s): 67baccc

Made some changes to clean up the full-force OCR code

Browse files
Files changed (1) hide show
  1. src/parsers/docling_parser.py +6 -40
src/parsers/docling_parser.py CHANGED
@@ -137,47 +137,21 @@ class DoclingParser(DocumentParser):
137
  pipeline_options.do_table_structure = True
138
  pipeline_options.table_structure_options.do_cell_matching = True
139
 
140
- # Find tesseract executable
141
- tesseract_cmd = None
142
- tesseract_paths = [
143
- "tesseract", # Default PATH
144
- "/usr/bin/tesseract", # Common Linux location
145
- "/app/tesseract/tesseract", # Possible custom location in Hugging Face
146
- "/opt/conda/bin/tesseract", # Possible Conda env in Hugging Face
147
- r"C:\Program Files\Tesseract-OCR\tesseract.exe" # Windows location
148
- ]
149
-
150
- for path in tesseract_paths:
151
- if shutil.which(path) or (os.path.isfile(path) and os.access(path, os.X_OK)):
152
- tesseract_cmd = path
153
- print(f"Found tesseract at: {tesseract_cmd}")
154
- break
155
-
156
- if not tesseract_cmd:
157
- print("Warning: Tesseract executable not found. Using default configuration.")
158
- tesseract_cmd = "tesseract" # Use default as fallback
159
-
160
- # Configure OCR options with explicit tesseract path
161
- ocr_options = TesseractCliOcrOptions(
162
- force_full_page_ocr=True,
163
- tesseract_cmd=tesseract_cmd
164
- )
165
  pipeline_options.ocr_options = ocr_options
166
 
167
  # Set up format options for both PDF and image formats
168
  format_options = {}
169
 
170
- # Always include PDF format option
171
  format_options[InputFormat.PDF] = PdfFormatOption(
172
  pipeline_options=pipeline_options,
173
  )
174
 
175
- # For image files, we need to handle them differently
176
  if file_extension in ['.jpg', '.jpeg', '.png', '.tiff', '.tif', '.bmp']:
177
- # For image files, we'll use the same pipeline options
178
- # but we need to specify the input format as IMAGE
179
  print(f"Processing as image file: {file_extension}")
180
- # Note: InputFormat.IMAGE is used for image files in Docling
181
  format_options[InputFormat.IMAGE] = PdfFormatOption(
182
  pipeline_options=pipeline_options,
183
  )
@@ -191,17 +165,9 @@ class DoclingParser(DocumentParser):
191
  doc = result.document
192
  return doc.export_to_markdown()
193
  except Exception as e:
194
- # Provide detailed error information
195
- print(f"Error during full force OCR: {e}")
196
  print(f"File type: {file_extension}, File exists: {input_doc.exists()}")
197
-
198
- # Try fallback to regular OCR if full force fails
199
- try:
200
- print("Attempting fallback to regular tesseract_cli OCR...")
201
- return self.parse(file_path, ocr_method="tesseract_cli")
202
- except Exception as fallback_error:
203
- print(f"Fallback OCR also failed: {fallback_error}")
204
- return f"OCR failed for {input_doc}. Error: {str(e)}"
205
 
206
 
207
  # Register the parser with the registry
 
137
  pipeline_options.do_table_structure = True
138
  pipeline_options.table_structure_options.do_cell_matching = True
139
 
140
+ # Configure OCR options - using TesseractCliOcrOptions directly without the text column issue
141
+ ocr_options = TesseractCliOcrOptions(force_full_page_ocr=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
142
  pipeline_options.ocr_options = ocr_options
143
 
144
  # Set up format options for both PDF and image formats
145
  format_options = {}
146
 
147
+ # PDF format option
148
  format_options[InputFormat.PDF] = PdfFormatOption(
149
  pipeline_options=pipeline_options,
150
  )
151
 
152
+ # Handle image files
153
  if file_extension in ['.jpg', '.jpeg', '.png', '.tiff', '.tif', '.bmp']:
 
 
154
  print(f"Processing as image file: {file_extension}")
 
155
  format_options[InputFormat.IMAGE] = PdfFormatOption(
156
  pipeline_options=pipeline_options,
157
  )
 
165
  doc = result.document
166
  return doc.export_to_markdown()
167
  except Exception as e:
168
+ print(f"Error during OCR processing: {e}")
 
169
  print(f"File type: {file_extension}, File exists: {input_doc.exists()}")
170
+ return f"OCR failed for {input_doc}. Error: {str(e)}"
 
 
 
 
 
 
 
171
 
172
 
173
  # Register the parser with the registry