Didier committed on
Commit
8679e11
·
verified ·
1 Parent(s): fb8a779

Upload 3 files

Browse files
Files changed (4) hide show
  1. .gitattributes +1 -0
  2. module_ocr.py +172 -0
  3. ocr.py +844 -0
  4. sample_ID.jpeg +3 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ sample_ID.jpeg filter=lfs diff=lfs merge=lfs -text
module_ocr.py ADDED
@@ -0,0 +1,172 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ File: module_ocr.py
3
+
4
+ Description: Gradio module to interact with the tesseract OCR code.
5
+
6
+ Author: Didier Guillevic
7
+ Date: 2024-11-23
8
+ """
9
+
10
+ import gradio as gr
11
+ import os
12
+ import uuid
13
+ import shutil
14
+ import threading
15
+ import time
16
+ import pathlib
17
+
18
+ import ocr
19
+ import lang_codes
20
+
21
+
22
+ # Directory to save the (temporary) OCR'ed PDF files (whose path is returned to user)
23
+ output_dir = "tmp_results"
24
+ os.makedirs(output_dir, exist_ok=True)
25
+
26
+ # Define age limit for newly created files (in seconds; 3600 seconds = 1 hour)
27
+ AGE_LIMIT = 3600
28
+
29
+ # Function to clean up old PDF files
30
def cleanup_old_files(directory=None, age_limit=None, interval=3600, max_cycles=None):
    """Periodically delete expired ``.pdf`` files from the results directory.

    Intended to run forever in a daemon thread. Filesystem errors (for
    example a file removed by someone else between listing and deletion)
    are reported and skipped so that one failure does not end the loop.

    Args:
        directory: Directory to scan. Defaults to the module-level
            ``output_dir``.
        age_limit: Maximum age in seconds, measured from the file's
            modification time. Defaults to the module-level ``AGE_LIMIT``.
        interval: Seconds to sleep after each scan (default: one hour).
        max_cycles: Number of scans to run before returning; ``None``
            (the default) loops forever.
    """
    # Resolve defaults at call time so the zero-argument call used as the
    # thread target keeps reading the module-level settings.
    directory = output_dir if directory is None else directory
    age_limit = AGE_LIMIT if age_limit is None else age_limit

    cycles = 0
    while max_cycles is None or cycles < max_cycles:
        current_time = time.time()
        try:
            filenames = os.listdir(directory)
        except OSError as exc:
            print(f"Could not list directory {directory}: {exc}")
            filenames = []

        for filename in filenames:
            if not filename.endswith(".pdf"):
                continue
            file_path = os.path.join(directory, filename)
            try:
                # Check if the file is older than the age limit
                file_age = current_time - os.path.getmtime(file_path)
                if file_age > age_limit:
                    print(f"Removing old file: {file_path}")
                    os.remove(file_path)
            except OSError as exc:
                # The file may have disappeared or be locked; try again on
                # the next scan instead of letting the thread die.
                print(f"Could not remove file {file_path}: {exc}")

        cycles += 1
        # Sleep before checking again
        time.sleep(interval)
43
+
44
+ # Start the cleanup thread
45
+ cleanup_thread = threading.Thread(target=cleanup_old_files, daemon=True)
46
+ cleanup_thread.start()
47
+
48
+ #
49
+ # Process one file
50
+ #
51
def process(
    input_file: str,
    src_langs: list[str],  # list of ISO 639-3 language codes
    output_type: str
):
    """Process given file with OCR using given languages.

    Args:
        input_file: Path of the uploaded file (PDF or image). ``None``
            (nothing uploaded) yields the empty default result. An object
            exposing the path through a ``.name`` attribute is also
            accepted (presumably what some Gradio versions pass for file
            uploads; verify against the installed version).
        src_langs: Tesseract language codes, e.g. ``['eng', 'fra']``.
            When empty, ``'eng'`` is used.
        output_type: One of ``'text'``, ``'pdf'`` or ``'text+pdf'``.

    Returns:
        tuple: ``(output_text, output_pdf)``. ``output_text`` is ``''`` when
        no text was requested; ``output_pdf`` is the value returned by the
        searchable-PDF call, or ``None`` when no PDF was requested.
    """
    # default result
    output_text = ''
    output_pdf = None

    # Nothing uploaded yet: return the defaults instead of failing below.
    if input_file is None:
        return output_text, output_pdf

    # Normalise to a plain path string and use it consistently for every
    # OCR call (the PDF-to-text call previously used `input_file.name`,
    # which fails when `input_file` is a str).
    if isinstance(input_file, (str, os.PathLike)):
        input_path = os.fspath(input_file)
    else:
        input_path = input_file.name

    # format language as expected by tesseract package, e.g. 'eng+fra'
    language = '+'.join(src_langs) if src_langs else 'eng'

    # PDF file or image file?
    is_pdf = pathlib.Path(input_path).suffix.lower() == '.pdf'

    # output text?
    if output_type in ('text', 'text+pdf'):
        if is_pdf:
            texts = ocr.pdf_scanner.pdf_to_text(  # one text per page
                pdf_path=input_path,
                language=language
            )
            output_text = '\n\n'.join(texts)
        else:
            output_text = ocr.pdf_scanner.image_to_text(
                image_path=input_path,
                language=language,
                psm=3
            )

    # output pdf?
    if output_type in ('pdf', 'text+pdf'):
        # Create a path for output PDF file; the UUID keeps uploads that
        # share a file name from overwriting each other.
        base_filename, _ = os.path.splitext(os.path.basename(input_path))
        output_path = os.path.join(
            output_dir, f"{base_filename}_OCR_{uuid.uuid4()}.pdf"
        )

        if is_pdf:
            output_pdf = ocr.pdf_scanner.pdf_to_searchable_pdf_ocrmypdf(
                pdf_path=input_path,
                output_path=output_path,
                language=language,
                deskew=True,
                optimize=True,
                clean=False,
                attempt_repair=True
            )
        else:
            output_pdf = ocr.pdf_scanner.image_to_searchable_pdf(
                image_path=input_path,
                output_path=output_path,
                language=language,
                psm=3
            )

    return output_text, output_pdf
110
+
111
+ #
112
+ # User interface
113
+ #
114
+ with gr.Blocks() as demo:
115
+
116
+ # Upload file to process
117
+ with gr.Row():
118
+ input_file = gr.File(label="Upload a PDF file of a scanned document")
119
+ with gr.Column():
120
+ output_text = gr.Textbox(label="OCR output")
121
+ output_file = gr.File(label="Download OCR'ed PDF")
122
+
123
+ # Input: language(s) used in document, output types
124
+ with gr.Row():
125
+ src_langs = gr.Dropdown(
126
+ label='Language(s) of document',
127
+ choices=lang_codes.tesseract_lang_codes.items(),
128
+ multiselect=True,
129
+ value=['eng', 'fra'],
130
+ scale=4
131
+ )
132
+ output_type = gr.Dropdown(
133
+ label='Output type',
134
+ choices=['text', 'pdf', 'text+pdf'],
135
+ multiselect=False,
136
+ value='text',
137
+ scale=1
138
+ )
139
+
140
+ # Buttons
141
+ with gr.Row():
142
+ ocr_btn = gr.Button(value="OCR", variant="primary")
143
+ clear_btn = gr.Button("Clear", variant="secondary")
144
+
145
+ # Examples
146
+ with gr.Accordion("Examples", open=False):
147
+ examples = gr.Examples(
148
+ [
149
+ ['./pdfs/Non-text-searchable.pdf', ['eng','fra']],
150
+ ['./sample_ID.jpeg', ['eng','fra']],
151
+ ],
152
+ inputs=[input_file, src_langs, output_type],
153
+ outputs=[output_text, output_file],
154
+ fn=process,
155
+ cache_examples=False,
156
+ label="Examples"
157
+ )
158
+
159
+ # Functions
160
+ ocr_btn.click(
161
+ fn=process,
162
+ inputs=[input_file, src_langs, output_type],
163
+ outputs=[output_text, output_file]
164
+ )
165
+ clear_btn.click(
166
+ fn=lambda : (None, '', None),
167
+ inputs=[],
168
+ outputs=[input_file, output_text, output_file] # input_file, output_text, output_file
169
+ )
170
+
171
+ if __name__ == '__main__':
172
+ demo.launch()
ocr.py ADDED
@@ -0,0 +1,844 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ File: ocr.py
3
+
4
+ Description: (Traditional) Optical Character Recognition (OCR) using tesseract.
5
+
6
+ Author: Didier Guillevic
7
+ Date: 2024-11-23
8
+ """
9
+
10
+ import pytesseract
11
+ from pdf2image import convert_from_path
12
+ from pdf2image.exceptions import PDFPageCountError, PDFSyntaxError
13
+ import os
14
+ import uuid
15
+ import shutil
16
+ import logging
17
+ import pypdf
18
+ import subprocess
19
+ import ocrmypdf
20
+ from typing import List, Optional, Tuple, Union
21
+ from contextlib import contextmanager
22
+
23
+
24
# Tesseract page segmentation modes (PSM): numeric mode -> description.
# The position of each description in the tuple is its mode number.
tesseract_psm_modes = dict(enumerate((
    "Orientation and script detection (OSD) only.",
    "Automatic page segmentation with OSD.",
    "Automatic page segmentation, but no OSD, or OCR.",
    "Fully automatic page segmentation, but no OSD. (**default**)",
    "Assume a single column of text of variable sizes.",
    "Assume a single uniform block of vertically aligned text.",
    "Assume a single uniform block of text.",
    "Treat the image as a single text line.",
    "Treat the image as a single word.",
    "Treat the image as a single word in a circle.",
    "Treat the image as a single character.",
    "Sparse text. Find as much text as possible in no particular order.",
    "Sparse text with OSD.",
    "Raw line. Treat the image as a single text line, bypassing hacks that are Tesseract-specific.",
)))

# Reverse lookup: "<mode>: <description>" label -> numeric mode.
tesseract_psm_descriptions = {
    f"{mode}: {description}": mode
    for mode, description in tesseract_psm_modes.items()
}
57
+
58
class PDFScannerTempManager:
    """
    Creates uniquely named scratch directories under a base directory and
    removes them again, tracking the ones that are currently alive.
    """

    def __init__(self, base_temp_dir: str = 'tmp'):
        """
        Set up the manager and make sure the base directory exists.

        Args:
            base_temp_dir (str): Directory under which scratch directories
                are created; it is created if missing.
        """
        # Set up logging
        logging.basicConfig(level=logging.INFO)
        self.logger = logging.getLogger(__name__)

        self.base_temp_dir = base_temp_dir
        # Paths handed out by temp_directory() that have not been removed yet
        self.active_temp_dirs: list[str] = []

        os.makedirs(self.base_temp_dir, exist_ok=True)

    @contextmanager
    def temp_directory(self) -> str:
        """
        Context manager providing a fresh scratch directory.

        The directory is named with a random UUID, registered as active,
        and removed with all its contents when the ``with`` block exits,
        whether normally or through an exception.

        Yields:
            str: Path to the scratch directory
        """
        scratch = os.path.join(self.base_temp_dir, str(uuid.uuid4()))

        try:
            # exist_ok=False: an existing directory of that name is an error
            os.makedirs(scratch, exist_ok=False)
            self.active_temp_dirs.append(scratch)
            yield scratch
        finally:
            self._cleanup_directory(scratch)

    def _cleanup_directory(self, directory: str) -> None:
        """
        Delete one scratch directory and stop tracking it.

        Failures are logged rather than raised.

        Args:
            directory (str): Path to directory to remove
        """
        try:
            if os.path.exists(directory):
                shutil.rmtree(directory)

            # Forget the directory; it may already be untracked.
            try:
                self.active_temp_dirs.remove(directory)
            except ValueError:
                pass

        except Exception as e:
            self.logger.error(f"Error cleaning up directory {directory}: {e}")

    def cleanup_all(self) -> None:
        """
        Remove every scratch directory that is still tracked as active.
        """
        # Iterate over a snapshot: _cleanup_directory mutates the list.
        for tracked in tuple(self.active_temp_dirs):
            self._cleanup_directory(tracked)
128
+
129
class PDFScanner:
    """
    A class to perform OCR on PDF files using Tesseract with robust temp management.

    Public methods extract text from PDFs and images, or turn them into
    searchable PDFs. Intermediate files are written into scratch
    directories obtained from ``self.temp_manager.temp_directory()``.
    """

    def __init__(self, tesseract_cmd: str = 'tesseract', dpi: int = 300,
                 temp_manager: Optional["PDFScannerTempManager"] = None):
        """
        Initialize the PDFScanner.

        Args:
            tesseract_cmd (str): Path to tesseract executable
            dpi (int): DPI for PDF conversion
            temp_manager (PDFScannerTempManager, optional): Temp directory
                manager; a default one is created when omitted
        """
        self.dpi = dpi
        self.temp_manager = temp_manager or PDFScannerTempManager()
        # NOTE: this sets pytesseract's module-level command, so it is
        # shared by every PDFScanner instance in the process.
        pytesseract.pytesseract.tesseract_cmd = tesseract_cmd

        self.logger = logging.getLogger(__name__)

    # ------------------------------------------------------------------
    # Private helpers
    # ------------------------------------------------------------------

    def _validate_pdf(self, pdf_path: str) -> Tuple[bool, str, bool]:
        """
        Validate PDF file and check for encryption.

        Never raises: any failure is reported through the returned tuple.

        Returns:
            Tuple[bool, str, bool]: (is_valid, error_message, is_encrypted)
        """
        try:
            with open(pdf_path, 'rb') as file:
                # Check if file starts with PDF signature
                if file.read(4) != b'%PDF':
                    return False, "Not a valid PDF file (missing PDF signature)", False

                # Reset file pointer so pypdf reads from the start
                file.seek(0)

                try:
                    pdf_reader = pypdf.PdfReader(file, strict=False)

                    if pdf_reader.is_encrypted:
                        return False, "PDF is encrypted and requires password", True

                    num_pages = len(pdf_reader.pages)
                    return True, f"Valid PDF with {num_pages} pages", False

                except pypdf.errors.PdfReadError as e:
                    return False, f"Invalid PDF structure: {str(e)}", False

        except Exception as e:
            return False, f"Error validating PDF: {str(e)}", False

    def _repair_pdf(self, pdf_path: str, temp_dir: str) -> str:
        """
        Attempt to repair a corrupted PDF file.

        First rewrites the pages with pypdf; if that fails, falls back to
        re-distilling the file with Ghostscript (``gs``).

        Args:
            pdf_path (str): Path to original PDF
            temp_dir (str): Temporary directory for repair

        Returns:
            str: Path to repaired PDF

        Raises:
            Exception: If both repair attempts fail.
        """
        repaired_pdf = os.path.join(temp_dir, 'repaired.pdf')

        try:
            # pypdf repair attempt
            with open(pdf_path, 'rb') as file:
                reader = pypdf.PdfReader(file, strict=False)
                writer = pypdf.PdfWriter()

                for page in reader.pages:
                    writer.add_page(page)

                with open(repaired_pdf, 'wb') as output_file:
                    writer.write(output_file)

            if os.path.exists(repaired_pdf):
                return repaired_pdf

        except Exception as e:
            self.logger.warning("pypdf repair failed: %s", e)

        # Ghostscript repair attempt
        try:
            gs_command = [
                'gs',
                '-o', repaired_pdf,
                '-sDEVICE=pdfwrite',
                '-dPDFSETTINGS=/prepress',
                pdf_path
            ]

            process = subprocess.run(
                gs_command,
                capture_output=True,
                text=True
            )

            if process.returncode == 0 and os.path.exists(repaired_pdf):
                return repaired_pdf
            raise RuntimeError(f"Ghostscript repair failed: {process.stderr}")

        except Exception as e:
            self.logger.error("PDF repair failed: %s", e)
            raise

    def _prepare_pdf(self, pdf_path: str, temp_dir: str, attempt_repair: bool) -> str:
        """
        Validate a PDF and, if it is invalid, optionally try to repair it.

        Returns:
            str: Path to use for further processing: the original path, or
            the repaired copy when a repair succeeded. If the repair fails
            the original path is returned and processing is attempted anyway.

        Raises:
            Exception: If the PDF is encrypted.
        """
        is_valid, error_message, is_encrypted = self._validate_pdf(pdf_path)
        if is_valid:
            return pdf_path

        self.logger.warning("PDF validation issue: %s", error_message)

        if is_encrypted:
            raise RuntimeError("Cannot process encrypted PDF files")

        if attempt_repair:
            try:
                pdf_path = self._repair_pdf(pdf_path, temp_dir)
                self.logger.info("Using repaired PDF file")
            except Exception as e:
                self.logger.error("Repair failed: %s", e)

        return pdf_path

    def _extract_page_range(
        self,
        pdf_path: str,
        temp_dir: str,
        first_page: Optional[int],
        last_page: Optional[int]
    ) -> str:
        """
        Write the requested 1-based page range to ``partial.pdf`` in temp_dir.

        Returns:
            str: ``pdf_path`` unchanged when no range was requested,
            otherwise the path of the partial PDF.
        """
        if first_page is None and last_page is None:
            return pdf_path

        partial_pdf_path = os.path.join(temp_dir, 'partial.pdf')
        with open(pdf_path, 'rb') as input_file:
            reader = pypdf.PdfReader(input_file)
            writer = pypdf.PdfWriter()
            total_pages = len(reader.pages)

            # Use 0-based indexing for pypdf. Clamp at 0 so a negative
            # first_page cannot wrap around to pages counted from the end.
            start_page = max((first_page or 1) - 1, 0)
            end_page = min(last_page or total_pages, total_pages)

            for i in range(start_page, end_page):
                writer.add_page(reader.pages[i])

            with open(partial_pdf_path, 'wb') as output_file:
                writer.write(output_file)

        return partial_pdf_path

    def _ocr_image_to_pdf(
        self,
        image_path: str,
        output_base: str,
        language: str,
        psm: int,
        label: str = ''
    ) -> str:
        """
        Run the tesseract executable on one image to produce a searchable PDF.

        Args:
            image_path (str): Image to OCR
            output_base (str): Output path without extension; tesseract
                appends ``.pdf``
            language (str): Language(s) for OCR
            psm (int): Page segmentation mode
            label (str): Suffix inserted into error messages,
                e.g. ``' on page 3'``

        Returns:
            str: Path of the PDF that was created

        Raises:
            Exception: If tesseract exits with a non-zero status.
            FileNotFoundError: If the expected output PDF is missing.
        """
        tesseract_cmd = [
            pytesseract.pytesseract.tesseract_cmd,
            image_path,
            output_base,
            '-l', language,
            '--psm', str(psm),
            'pdf'
        ]

        process = subprocess.run(
            tesseract_cmd,
            capture_output=True,
            text=True
        )

        if process.returncode != 0:
            self.logger.error("Tesseract error%s: %s", label, process.stderr)
            raise RuntimeError(f"Tesseract failed{label}: {process.stderr}")

        page_pdf_path = f'{output_base}.pdf'
        if not os.path.exists(page_pdf_path):
            raise FileNotFoundError(f"Expected output PDF not found: {page_pdf_path}")
        return page_pdf_path

    def _merge_pdfs(self, page_pdfs: List[str], output_path: str) -> str:
        """
        Concatenate the given PDFs, in order, into ``output_path``.

        Raises:
            Exception: If ``page_pdfs`` is empty.
        """
        if not page_pdfs:
            raise RuntimeError("No pages were successfully processed")

        writer = pypdf.PdfWriter()
        for pdf in page_pdfs:
            reader = pypdf.PdfReader(pdf)
            for page in reader.pages:
                writer.add_page(page)

        # Write to the output path
        os.makedirs(os.path.dirname(os.path.abspath(output_path)), exist_ok=True)
        with open(output_path, "wb") as output_file:
            writer.write(output_file)

        self.logger.info("Created searchable PDF at %s", output_path)
        return output_path

    def _process_images(
        self,
        images: list,
        temp_dir: str,
        language: str
    ) -> list[str]:
        """
        OCR each page image and return one text per page.

        A page that fails is logged and represented by an
        ``[ERROR ON PAGE n]`` placeholder so page order is preserved.
        ``temp_dir`` is unused (the images are passed to pytesseract
        directly) and is kept for signature compatibility.
        """
        extracted_text = []
        for page_num, image in enumerate(images, start=1):
            try:
                # Perform OCR with additional configuration
                text = pytesseract.image_to_string(
                    image,
                    lang=language,
                    config='--psm 1 --oem 1'
                )
                extracted_text.append(text)

            except Exception as e:
                self.logger.error("Error processing page %s: %s", page_num, e)
                extracted_text.append(f"[ERROR ON PAGE {page_num}]")

        return extracted_text

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------

    def pdf_to_text(
        self,
        pdf_path: str,
        language: str = 'eng',
        first_page: Optional[int] = None,
        last_page: Optional[int] = None,
        attempt_repair: bool = True
    ) -> list[str]:
        """
        Convert a PDF file to text using OCR with robust error handling.

        Args:
            pdf_path (str): Path to the PDF file
            language (str): Language for OCR (default: 'eng')
            first_page (int, optional): First page to process (1-based)
            last_page (int, optional): Last page to process
            attempt_repair (bool): Whether to attempt repairing corrupted PDFs

        Returns:
            list[str]: List of extracted text for each page (empty when
            the conversion produced no page images and no error)

        Raises:
            FileNotFoundError: If ``pdf_path`` does not exist.
            Exception: If the PDF is encrypted or every conversion
                method fails.
        """
        if not os.path.exists(pdf_path):
            raise FileNotFoundError(f"PDF file not found: {pdf_path}")

        # Use context manager for automatic cleanup
        with self.temp_manager.temp_directory() as temp_dir:
            pdf_path = self._prepare_pdf(pdf_path, temp_dir, attempt_repair)

            # Conversion methods with increasing complexity
            conversion_methods = [
                {'use_pdftocairo': True, 'strict': False},
                {'use_pdftocairo': False, 'strict': False},
                {'use_pdftocairo': True, 'strict': False, 'dpi': self.dpi * 2},
                {'use_pdftocairo': False, 'strict': False, 'dpi': self.dpi * 3}
            ]

            last_error = None
            for method in conversion_methods:
                try:
                    self.logger.info("Trying conversion method: %s", method)
                    images = convert_from_path(
                        pdf_path,
                        dpi=method.get('dpi', self.dpi),
                        first_page=first_page,
                        last_page=last_page,
                        thread_count=4,
                        grayscale=True,
                        **{k: v for k, v in method.items() if k != 'dpi'}
                    )

                    if images:
                        return self._process_images(images, temp_dir, language)

                except Exception as e:
                    last_error = e
                    self.logger.warning("Method failed: %s", e)

            if last_error:
                raise RuntimeError(
                    f"All conversion methods failed. Last error: {str(last_error)}"
                ) from last_error

            # No error but no page images either: honour the declared
            # return type instead of implicitly returning None.
            return []

    def pdf_to_searchable_pdf(self,
                              pdf_path: str,
                              output_path: str,
                              language: str = 'eng',
                              first_page: Optional[int] = None,
                              last_page: Optional[int] = None,
                              attempt_repair: bool = True) -> str:
        """
        Convert a scanned PDF file to a searchable PDF using Tesseract.

        Each page is rendered to an image, OCR'ed into a one-page PDF by
        the tesseract executable, and the page PDFs are merged.

        Args:
            pdf_path (str): Path to the input PDF file
            output_path (str): Path to save the searchable PDF
            language (str): Language for OCR (default: 'eng')
            first_page (int, optional): First page to process (1-based)
            last_page (int, optional): Last page to process
            attempt_repair (bool): Whether to attempt repairing corrupted PDFs

        Returns:
            str: Path to the output searchable PDF

        Raises:
            FileNotFoundError: If ``pdf_path`` does not exist.
            Exception: If the PDF is encrypted, conversion or OCR fails,
                or no page could be processed.
        """
        if not os.path.exists(pdf_path):
            raise FileNotFoundError(f"PDF file not found: {pdf_path}")

        # Use context manager for automatic cleanup
        with self.temp_manager.temp_directory() as temp_dir:
            pdf_path = self._prepare_pdf(pdf_path, temp_dir, attempt_repair)

            # Process partial PDFs if requested
            pdf_path = self._extract_page_range(pdf_path, temp_dir, first_page, last_page)

            # Extract images from the PDF
            try:
                images = convert_from_path(
                    pdf_path,
                    dpi=self.dpi,
                    thread_count=4,
                    grayscale=False
                )
            except Exception as e:
                self.logger.error("Failed to convert PDF to images: %s", e)
                raise

            # Process each page individually
            page_pdfs = []
            for page_num, image in enumerate(images, start=1):
                image_path = os.path.join(temp_dir, f'page_{page_num}.png')
                pdf_output = os.path.join(temp_dir, f'page_{page_num}')

                try:
                    # The tesseract executable reads the page from disk
                    image.save(image_path, 'PNG')
                    page_pdfs.append(
                        self._ocr_image_to_pdf(
                            image_path, pdf_output, language, 1,
                            label=f" on page {page_num}"
                        )
                    )
                except Exception as e:
                    self.logger.error("Error processing page %s: %s", page_num, e)
                    raise

            # Merge all page PDFs into a single file
            return self._merge_pdfs(page_pdfs, output_path)

    def pdf_to_searchable_pdf_ocrmypdf(self,
                                       pdf_path: str,
                                       output_path: str,
                                       language: str = 'eng',
                                       first_page: Optional[int] = None,
                                       last_page: Optional[int] = None,
                                       deskew: bool = True,
                                       optimize: bool = True,
                                       clean: bool = False,
                                       attempt_repair: bool = True) -> str:
        """
        Convert a scanned PDF file to a searchable PDF using ocrmypdf.

        Args:
            pdf_path (str): Path to the input PDF file
            output_path (str): Path to save the searchable PDF
            language (str): Language for OCR (default: 'eng')
            first_page (int, optional): First page to process (1-based)
            last_page (int, optional): Last page to process
            deskew (bool): Whether to straighten pages
            optimize (bool): Whether to optimize the PDF
            clean (bool): Whether to clean the image before OCR
            attempt_repair (bool): Whether to attempt repairing corrupted PDFs

        Returns:
            str: Path to the output searchable PDF

        Raises:
            FileNotFoundError: If ``pdf_path`` does not exist.
            Exception: If the PDF is encrypted or ocrmypdf fails.
        """
        if not os.path.exists(pdf_path):
            raise FileNotFoundError(f"PDF file not found: {pdf_path}")

        # Use context manager for automatic cleanup
        with self.temp_manager.temp_directory() as temp_dir:
            pdf_path = self._prepare_pdf(pdf_path, temp_dir, attempt_repair)

            # Process partial PDFs if requested
            working_pdf_path = self._extract_page_range(
                pdf_path, temp_dir, first_page, last_page
            )

            try:
                # Ensure the output directory exists
                output_dir = os.path.dirname(os.path.abspath(output_path))
                os.makedirs(output_dir, exist_ok=True)

                # ocrmypdf has a rich set of options
                optimize_level = 1 if optimize else 0

                # Run ocrmypdf
                result = ocrmypdf.ocr(
                    working_pdf_path,
                    output_path,
                    language=language,
                    optimize=optimize_level,
                    skip_text=True,  # Don't redo OCR on pages with text
                    deskew=deskew,   # Straighten pages
                    clean=clean,     # Clean pages before OCR
                    progress_bar=False,
                    use_threads=True,
                    jobs=os.cpu_count() or 4
                )

                if result != 0:
                    raise RuntimeError(f"ocrmypdf returned non-zero exit code: {result}")

                self.logger.info("Created searchable PDF at %s", output_path)
                return output_path

            except Exception as e:
                self.logger.error("Error creating searchable PDF with ocrmypdf: %s", e)
                raise

    def image_to_text(self,
                      image_path: str,
                      language: str = 'eng',
                      psm: int = 3
                      ) -> str:
        """
        Extract text from an image file using OCR.

        Args:
            image_path (str): Path to the image file
            language (str): Language for OCR (default: 'eng')
            psm (int): Page segmentation mode (default: 3)

        Returns:
            str: Extracted text from the image

        Raises:
            FileNotFoundError: If ``image_path`` does not exist.
        """
        if not os.path.exists(image_path):
            raise FileNotFoundError(f"Image file not found: {image_path}")

        try:
            # Use Pillow to open the image
            from PIL import Image

            # The context manager closes the underlying file once OCR is done
            with Image.open(image_path) as image:
                # Perform OCR with specified parameters
                return pytesseract.image_to_string(
                    image,
                    lang=language,
                    config=f'--psm {psm} --oem 1'
                )

        except Exception as e:
            self.logger.error("Error extracting text from image: %s", e)
            raise

    def image_to_searchable_pdf(self,
                                image_path: str,
                                output_path: str,
                                language: str = 'eng',
                                psm: int = 3
                                ) -> str:
        """
        Convert an image file to a searchable PDF with OCR text.

        Args:
            image_path (str): Path to the image file
            output_path (str): Path to save the searchable PDF
            language (str): Language for OCR (default: 'eng')
            psm (int): Page segmentation mode (default: 3)

        Returns:
            str: Path to the output searchable PDF

        Raises:
            FileNotFoundError: If ``image_path`` does not exist or
                tesseract produced no output file.
            Exception: If tesseract fails.
        """
        if not os.path.exists(image_path):
            raise FileNotFoundError(f"Image file not found: {image_path}")

        # Use context manager for automatic cleanup
        with self.temp_manager.temp_directory() as temp_dir:
            try:
                # Use Tesseract directly to create a searchable PDF
                temp_pdf_path = self._ocr_image_to_pdf(
                    image_path, os.path.join(temp_dir, 'output'), language, psm
                )

                # Ensure output directory exists
                os.makedirs(os.path.dirname(os.path.abspath(output_path)), exist_ok=True)

                # Copy the file out before the scratch directory is removed
                shutil.copy(temp_pdf_path, output_path)

                self.logger.info("Created searchable PDF at %s", output_path)
                return output_path

            except Exception as e:
                self.logger.error("Error creating searchable PDF from image: %s", e)
                raise

    def images_to_searchable_pdf(self,
                                 image_paths: List[str],
                                 output_path: str,
                                 language: str = 'eng',
                                 psm: int = 3
                                 ) -> str:
        """
        Convert multiple image files to a single searchable PDF with OCR text.

        Args:
            image_paths (List[str]): List of paths to image files
            output_path (str): Path to save the searchable PDF
            language (str): Language for OCR (default: 'eng')
            psm (int): Page segmentation mode (default: 3)

        Returns:
            str: Path to the output searchable PDF

        Raises:
            ValueError: If ``image_paths`` is empty.
            FileNotFoundError: If one of the images does not exist.
            Exception: If tesseract fails on any image.
        """
        if not image_paths:
            raise ValueError("No image paths provided")

        # Use context manager for automatic cleanup
        with self.temp_manager.temp_directory() as temp_dir:
            try:
                # Process each image separately; one PDF page per image
                page_pdfs = []
                for index, img_path in enumerate(image_paths, start=1):
                    if not os.path.exists(img_path):
                        raise FileNotFoundError(f"Image file not found: {img_path}")

                    page_pdfs.append(
                        self._ocr_image_to_pdf(
                            img_path,
                            os.path.join(temp_dir, f'page_{index}'),
                            language,
                            psm,
                            label=f" on image {index}"
                        )
                    )

                # Merge all page PDFs into a single file
                return self._merge_pdfs(page_pdfs, output_path)

            except Exception as e:
                self.logger.error("Error creating searchable PDF from images: %s", e)
                raise
755
+
756
+ #
757
+ # PDFScanner (singleton)
758
+ #
759
+ pdf_scanner = PDFScanner()
760
+
761
+
762
def main():
    """
    Demonstrate the PDFScanner API on sample files.

    Runs, in order: PDF text extraction, searchable-PDF creation with
    Tesseract and with ocrmypdf, image text extraction, and searchable-PDF
    creation from one and from several images. Any error is printed and
    ends the demo; scratch directories are cleaned up in all cases.
    """
    source_pdf = "./pdfs/Non-text-searchable.pdf"
    # Scratch directories are created under ./tmp
    manager = PDFScannerTempManager(base_temp_dir='tmp')

    try:
        ocr_engine = PDFScanner(temp_manager=manager)

        # 1. Text extraction, one text per page
        print("Extracting text from PDF...")
        page_texts = ocr_engine.pdf_to_text(source_pdf, attempt_repair=True)
        for page_number, page_text in enumerate(page_texts, 1):
            print(f"\n=== Page {page_number} ===")
            print(page_text)

        # 2. Searchable PDF via Tesseract's direct PDF output
        print("\nCreating searchable PDF using Tesseract...")
        tesseract_pdf = "searchable_output_tesseract.pdf"
        ocr_engine.pdf_to_searchable_pdf(
            source_pdf, tesseract_pdf, attempt_repair=True
        )
        print(f"Searchable PDF created at: {tesseract_pdf}")

        # 3. Searchable PDF via ocrmypdf
        print("\nCreating searchable PDF using ocrmypdf...")
        ocrmypdf_pdf = "searchable_output_ocrmypdf.pdf"
        ocr_engine.pdf_to_searchable_pdf_ocrmypdf(
            source_pdf,
            ocrmypdf_pdf,
            deskew=True,
            optimize=True,
            clean=False,
            attempt_repair=True,
        )
        print(f"Searchable PDF (ocrmypdf method) created at: {ocrmypdf_pdf}")

        # 4. Text from a single image
        sample_image = "./images/sample.png"
        print("Extracting text from image...")
        image_text = ocr_engine.image_to_text(sample_image)
        print("Extracted text:")
        print(image_text)

        # 5. Searchable PDF from a single image
        print("\nCreating searchable PDF from image...")
        single_image_pdf = "searchable_image.pdf"
        ocr_engine.image_to_searchable_pdf(sample_image, single_image_pdf)
        print(f"Searchable PDF created at: {single_image_pdf}")

        # 6. One searchable PDF from several images
        page_images = [
            "./images/page1.png",
            "./images/page2.jpg",
            "./images/page3.tiff",
        ]
        print("\nCreating searchable PDF from multiple images...")
        multi_image_pdf = "searchable_multiple_images.pdf"
        ocr_engine.images_to_searchable_pdf(page_images, multi_image_pdf)
        print(f"Multi-page searchable PDF created at: {multi_image_pdf}")

    except Exception as e:
        print(f"Error: {str(e)}")

    finally:
        # Explicitly clean up all temp directories
        manager.cleanup_all()
839
+
840
+ if __name__ == "__main__":
841
+ main()
842
+
843
+
844
+
sample_ID.jpeg ADDED

Git LFS Details

  • SHA256: 61666983ee2654df2c51a2493acea1013955fc4836da2143f668cc077fdbe46a
  • Pointer size: 131 Bytes
  • Size of remote file: 202 kB