File size: 5,618 Bytes
8679e11 ff1ac9c 8679e11 2b3a14a 8679e11 a9c18a3 1b63fc6 2b3a14a 1b63fc6 a9c18a3 8679e11 5b41ca2 8679e11 34ee99f 8679e11 d250bbb 8679e11 2b3a14a 7b40055 8679e11 5b41ca2 8679e11 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 |
"""
File: module_ocr.py
Description: Gradio module to interact with the tesseract OCR code.
Author: Didier Guillevic
Date: 2024-11-23
"""
import gradio as gr
import os
import uuid
import shutil
import threading
import time
import pathlib
import ocr
import lang_codes
# Directory holding the (temporary) OCR'ed PDF files whose paths are returned to the user
output_dir = "tmp_results"
pathlib.Path(output_dir).mkdir(parents=True, exist_ok=True)

# Age limit for generated files, in seconds (60 * 60 = 3600 seconds = 1 hour)
AGE_LIMIT = 60 * 60
# Function to clean up old PDF files
def cleanup_old_files():
    """Periodically delete expired PDF files from ``output_dir``.

    Runs forever. Each pass removes every file in ``output_dir`` whose name
    ends with ``.pdf`` and whose modification time lies more than
    ``AGE_LIMIT`` seconds in the past, then sleeps for one hour.

    Filesystem errors are reported and skipped instead of being raised: an
    uncaught exception would terminate the thread running this loop and
    silently stop all further cleanup.
    """
    while True:
        current_time = time.time()

        try:
            filenames = os.listdir(output_dir)
        except OSError as err:
            # Directory missing or unreadable: skip this pass, retry later.
            print(f"Could not list directory {output_dir}: {err}")
            filenames = []

        for filename in filenames:
            if not filename.endswith(".pdf"):
                continue
            file_path = os.path.join(output_dir, filename)
            try:
                # Check if the file is older than the age limit
                file_age = current_time - os.path.getmtime(file_path)
                if file_age > AGE_LIMIT:
                    print(f"Removing old file: {file_path}")
                    os.remove(file_path)
            except OSError as err:
                # The file may have vanished or may not be removable;
                # keep going with the remaining files.
                print(f"Could not remove file {file_path}: {err}")

        # Sleep for an hour before checking again
        time.sleep(3600)
# Run the periodic cleanup in the background. The thread is flagged as a
# daemon so that it does not keep the interpreter alive at exit.
cleanup_thread = threading.Thread(target=cleanup_old_files)
cleanup_thread.daemon = True
cleanup_thread.start()
#
# Process one file
#
def process(
    input_file: str,
    src_langs: list[str], # list of ISO 639-3 language codes
    output_type: str
):
    """Process given file with OCR using given languages.

    Args:
        input_file: Path of the file to process. A ``.pdf`` suffix
            (case-insensitive) selects the PDF routines; any other suffix is
            handled as an image. A falsy value (``None`` or ``''``, e.g. when
            nothing has been uploaded) yields the default result.
        src_langs: Language codes, joined with ``+`` into the form expected
            by tesseract (e.g. ``['eng', 'fra']`` becomes ``'eng+fra'``).
        output_type: One of ``'text'``, ``'pdf'`` or ``'text+pdf'``. Any
            other value produces neither output.

    Returns:
        A tuple ``(output_text, output_pdf)``. ``output_text`` is ``''``
        unless text output was requested. ``output_pdf`` is ``None`` unless
        PDF output was requested, in which case it is the value returned by
        the searchable-PDF routine of ``ocr.pdf_scanner``.
    """
    # default result
    output_text = ''
    output_pdf = None

    # Nothing to process (e.g. OCR requested before a file was uploaded)
    if not input_file:
        return output_text, output_pdf

    # format language as expected by tesseract package, e.g. 'eng+fra'
    language = '+'.join(src_langs)

    # PDF file or image file?
    is_pdf = pathlib.Path(input_file).suffix.lower() == '.pdf'

    # output text?
    if output_type in ['text', 'text+pdf']:
        if is_pdf:
            # one text per page; the path is passed as-is (a plain str has
            # no `.name` attribute)
            texts = ocr.pdf_scanner.pdf_to_text(
                pdf_path=input_file,
                language=language
            )
            output_text = '\n\n'.join(texts)
        else:
            output_text = ocr.pdf_scanner.image_to_text(
                image_path=input_file,
                language=language,
                psm=3
            )

    # output pdf?
    if output_type in ['pdf', 'text+pdf']:
        # Create a path for output PDF file; the uuid keeps results of
        # identically named uploads apart
        base_filename = os.path.basename(input_file)
        base_filename, _ = os.path.splitext(base_filename)
        output_path = f"{base_filename}_OCR_{uuid.uuid4()}.pdf"
        output_path = os.path.join(output_dir, output_path)

        if is_pdf:
            output_pdf = ocr.pdf_scanner.pdf_to_searchable_pdf(
                pdf_path=input_file,
                output_path=output_path,
                language=language,
                attempt_repair=True
            )
        else:
            output_pdf = ocr.pdf_scanner.image_to_searchable_pdf(
                image_path=input_file,
                output_path=output_path,
                language=language,
                psm=3
            )

    return output_text, output_pdf
#
# User interface
#
with gr.Blocks() as demo:
    def update_visibility(file):
        """Return a visibility update: visible when `file` is truthy, hidden otherwise."""
        return gr.update(visible=True) if file else gr.update(visible=False)

    # Upload file to process
    with gr.Row():
        with gr.Column():
            input_file = gr.File(
                label="Upload an image or a PDF file of a scanned document",
                height=160
            )
            output_file = gr.File(
                label="Download OCR'ed PDF",
                visible=False # Initially not visible
            )
        with gr.Column():
            output_text = gr.Textbox(label="OCR output")

    # Input: language(s) used in document, output types
    with gr.Row():
        src_langs = gr.Dropdown(
            label='Language(s) of document',
            choices=lang_codes.tesseract_lang_codes.items(),
            multiselect=True,
            value=['eng', 'fra'],
            scale=4
        )
        output_type = gr.Dropdown(
            label='Output type',
            choices=['text', 'pdf', 'text+pdf'],
            multiselect=False,
            value='text+pdf',
            scale=1
        )

    # Buttons
    with gr.Row():
        ocr_btn = gr.Button(value="OCR", variant="primary")
        clear_btn = gr.Button("Clear", variant="secondary")

    # Examples
    # NOTE(review): each example row supplies two values (file, languages)
    # while three input components are listed - confirm that leaving
    # output_type without an example value is intended.
    with gr.Accordion("Examples", open=False):
        examples = gr.Examples(
            [
                ['./Non-text-searchable.pdf', ['eng','fra']],
                ['./sample_ID.jpeg', ['eng','fra']],
            ],
            inputs=[input_file, src_langs, output_type],
            outputs=[output_text, output_file],
            fn=process,
            cache_examples=False,
            label="Examples"
        )

    # Documentation
    with gr.Accordion("Documentation", open=False):
        gr.Markdown(f"""
- Model: using the tesseract package for OCR 1.0 (traditional)
""")

    # Functions
    # OCR button: run `process` on the current inputs, then show or hide the
    # download component depending on whether a PDF was produced.
    ocr_btn.click(
        fn=process,
        inputs=[input_file, src_langs, output_type],
        outputs=[output_text, output_file]
    ).then(
        update_visibility,
        inputs=output_file,
        outputs=output_file
    )
    # Clear button: reset the upload, the text output and the download
    # component, then re-evaluate the download component's visibility.
    clear_btn.click(
        fn=lambda : (None, '', None),
        inputs=[],
        outputs=[input_file, output_text, output_file] # input_file, output_text, output_file
    ).then(
        update_visibility,
        inputs=output_file,
        outputs=output_file
    )

# Launch the interface only when this module is executed as a script
if __name__ == '__main__':
    demo.launch()
|