File size: 5,036 Bytes
8679e11 34ee99f 8679e11 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 |
"""
File: module_ocr.py
Description: Gradio module to interact with the tesseract OCR code.
Author: Didier Guillevic
Date: 2024-11-23
"""
import gradio as gr
import os
import uuid
import shutil
import threading
import time
import pathlib
import ocr
import lang_codes
# Maximum age (in seconds) of a generated file before it becomes eligible
# for cleanup: 3600 seconds = 1 hour.
AGE_LIMIT = 3600

# Folder holding the (temporary) OCR'ed PDF files whose paths are handed
# back to the user; created at import time when it does not exist yet.
output_dir = "tmp_results"
pathlib.Path(output_dir).mkdir(parents=True, exist_ok=True)
# Function to clean up old PDF files
def cleanup_old_files():
    """Periodically delete expired PDF files from ``output_dir``.

    Loops forever, so it is meant to be used as the target of a background
    thread. On every pass, each ``.pdf`` file in ``output_dir`` whose
    modification time is more than ``AGE_LIMIT`` seconds in the past is
    removed; the function then sleeps for one hour before the next pass.

    File-system errors are reported and skipped rather than propagated: an
    uncaught exception would end the thread and silently stop all future
    cleanups.
    """
    while True:
        current_time = time.time()

        try:
            filenames = os.listdir(output_dir)
        except OSError as err:
            # E.g. the directory was removed while the app is running.
            print(f"Could not list {output_dir}: {err}")
            filenames = []

        for filename in filenames:
            if not filename.endswith(".pdf"):
                continue
            file_path = os.path.join(output_dir, filename)
            try:
                # Check if the file is older than the age limit
                file_age = current_time - os.path.getmtime(file_path)
                if file_age > AGE_LIMIT:
                    print(f"Removing old file: {file_path}")
                    os.remove(file_path)
            except OSError as err:
                # The file may have vanished or be locked; try again next pass.
                print(f"Could not clean up {file_path}: {err}")

        # Sleep for an hour before checking again
        time.sleep(3600)
# Run the cleanup loop in the background for the lifetime of the process.
# The thread is a daemon so that it never prevents the interpreter from exiting.
cleanup_thread = threading.Thread(
    daemon=True,
    target=cleanup_old_files,
)
cleanup_thread.start()
#
# Process one file
#
def process(
    input_file: str,
    src_langs: list[str],  # list of ISO 639-3 language codes
    output_type: str
):
    """Process the given file with OCR using the given languages.

    Args:
        input_file: Path of the uploaded PDF or image file. An object that
            exposes the path as ``.name`` is accepted as well. ``None`` or an
            empty value means that no file was uploaded.
        src_langs: ISO 639-3 codes of the languages used in the document.
        output_type: 'text', 'pdf' or 'text+pdf'.

    Returns:
        A tuple ``(output_text, output_pdf)``. ``output_text`` is the
        recognized text ('' when text output was not requested) and
        ``output_pdf`` is the value returned by the searchable-PDF helper of
        the ``ocr`` module (``None`` when PDF output was not requested).
    """
    # default result
    output_text = ''
    output_pdf = None

    # Nothing uploaded: return the default result instead of crashing.
    if not input_file:
        return output_text, output_pdf

    # Normalize the input to a plain path string once, so that every call
    # below receives the same kind of value.
    if isinstance(input_file, (str, os.PathLike)):
        input_path = os.fspath(input_file)
    else:
        input_path = input_file.name

    # format language as expected by tesseract package, e.g. 'eng+fra'
    language = '+'.join(src_langs)

    # PDF file or image file?
    is_pdf = pathlib.Path(input_path).suffix.lower() == '.pdf'

    # output text?
    if output_type in ['text', 'text+pdf']:
        if is_pdf:
            texts = ocr.pdf_scanner.pdf_to_text(  # one text per page
                pdf_path=input_path,
                language=language
            )
            output_text = '\n\n'.join(texts)
        else:
            output_text = ocr.pdf_scanner.image_to_text(
                image_path=input_path,
                language=language,
                psm=3
            )

    # output pdf?
    if output_type in ['pdf', 'text+pdf']:
        # Create a path for the output PDF file; the UUID keeps concurrent
        # requests for files with the same name from colliding.
        base_filename = os.path.basename(input_path)
        base_filename, _ = os.path.splitext(base_filename)
        output_path = f"{base_filename}_OCR_{uuid.uuid4()}.pdf"
        output_path = os.path.join(output_dir, output_path)

        if is_pdf:
            output_pdf = ocr.pdf_scanner.pdf_to_searchable_pdf_ocrmypdf(
                pdf_path=input_path,
                output_path=output_path,
                language=language,
                deskew=True,
                optimize=True,
                clean=False,
                attempt_repair=True
            )
        else:
            output_pdf = ocr.pdf_scanner.image_to_searchable_pdf(
                image_path=input_path,
                output_path=output_path,
                language=language,
                psm=3
            )

    return output_text, output_pdf
#
# User interface
#
# Gradio user interface wired to process().
# NOTE(review): the nesting of the layout blocks below was reconstructed from
# a copy of this file that had lost its indentation -- confirm against the app.
with gr.Blocks() as demo:
    # Upload file to process (left) and OCR results (right)
    with gr.Row():
        input_file = gr.File(label="Upload a PDF file of a scanned document")
        with gr.Column():
            output_text = gr.Textbox(label="OCR output")
            output_file = gr.File(label="Download OCR'ed PDF")

    # Input: language(s) used in document, output types
    with gr.Row():
        src_langs = gr.Dropdown(
            label='Language(s) of document',
            choices=lang_codes.tesseract_lang_codes.items(),
            multiselect=True,
            value=['eng', 'fra'],
            scale=4
        )
        output_type = gr.Dropdown(
            label='Output type',
            choices=['text', 'pdf', 'text+pdf'],
            multiselect=False,
            value='text',
            scale=1
        )

    # Buttons
    with gr.Row():
        ocr_btn = gr.Button(value="OCR", variant="primary")
        clear_btn = gr.Button("Clear", variant="secondary")

    # Examples: each row supplies a file and its languages only; no value is
    # given for output_type.
    with gr.Accordion("Examples", open=False):
        examples = gr.Examples(
            [
                ['./Non-text-searchable.pdf', ['eng','fra']],
                ['./sample_ID.jpeg', ['eng','fra']],
            ],
            inputs=[input_file, src_langs, output_type],
            outputs=[output_text, output_file],
            fn=process,
            cache_examples=False,
            label="Examples"
        )

    # Event handlers
    # "OCR" runs process() on the current inputs and fills both outputs.
    ocr_btn.click(
        fn=process,
        inputs=[input_file, src_langs, output_type],
        outputs=[output_text, output_file]
    )
    # "Clear" resets the uploaded file, the text box and the download slot.
    clear_btn.click(
        fn=lambda : (None, '', None),
        inputs=[],
        outputs=[input_file, output_text, output_file] # same order as the lambda's tuple
    )

# Launch the app only when this module is executed as a script.
if __name__ == '__main__':
    demo.launch()
|