Didier's picture
Upload 3 files
8679e11 verified
raw
history blame
5.04 kB
"""
File: module_ocr.py
Description: Gradio module to interact with the tesseract OCR code.
Author: Didier Guillevic
Date: 2024-11-23
"""
import gradio as gr
import os
import uuid
import shutil
import threading
import time
import pathlib
import ocr
import lang_codes
# Folder holding the (temporary) OCR'ed PDF files; paths inside it are returned to the user
output_dir = "tmp_results"
pathlib.Path(output_dir).mkdir(parents=True, exist_ok=True)

# Maximum age, in seconds, a generated file may reach before cleanup removes it (1 hour)
AGE_LIMIT = 60 * 60
# Function to clean up old PDF files
def cleanup_old_files():
    """Periodically delete expired PDF files from ``output_dir``.

    Runs forever: on each pass every ``*.pdf`` file in ``output_dir`` whose
    modification time is more than ``AGE_LIMIT`` seconds in the past is
    removed, then the function sleeps for an hour. Intended to be run in a
    background thread; it never returns.

    File-system errors (e.g. a file disappearing between the directory
    listing and its removal) are reported and skipped so that a single
    failure does not terminate the cleanup loop.
    """
    while True:
        current_time = time.time()

        try:
            filenames = os.listdir(output_dir)
        except OSError as err:
            # Directory temporarily unavailable: skip this pass, retry later
            print(f"Could not list directory {output_dir}: {err}")
            filenames = []

        for filename in filenames:
            if not filename.endswith(".pdf"):
                continue
            file_path = os.path.join(output_dir, filename)
            try:
                # Check if the file is older than the age limit
                file_age = current_time - os.path.getmtime(file_path)
                if file_age > AGE_LIMIT:
                    print(f"Removing old file: {file_path}")
                    os.remove(file_path)
            except OSError as err:
                # Keep going with the remaining files instead of dying
                print(f"Could not clean up file {file_path}: {err}")

        # Sleep for an hour before checking again
        time.sleep(3600)
# Launch the periodic cleanup in the background as a daemon thread
cleanup_thread = threading.Thread(daemon=True, target=cleanup_old_files)
cleanup_thread.start()
#
# Process one file
#
def process(
        input_file: str,
        src_langs: list[str],  # list of ISO 639-3 language codes
        output_type: str
    ):
    """Process given file with OCR using given languages.

    Args:
        input_file: Path of the file to process. A ``.pdf`` suffix (case
            insensitive) selects the PDF code path; any other suffix is
            handled as an image. When no file is given (``None`` or empty),
            the default (empty) result is returned.
        src_langs: Language codes; they are joined with ``'+'`` (e.g.
            ``'eng+fra'``) before being passed to the OCR helpers.
        output_type: ``'text'``, ``'pdf'`` or ``'text+pdf'``. Any other value
            produces the default (empty) result.

    Returns:
        A tuple ``(output_text, output_pdf)``. ``output_text`` is the
        recognized text (``''`` when text output was not requested).
        ``output_pdf`` is whatever the ``ocr.pdf_scanner`` PDF helper returns
        (presumably the path of the generated searchable PDF -- confirm
        against the ``ocr`` module), or ``None`` when PDF output was not
        requested.
    """
    # default result
    output_text = ''
    output_pdf = None

    # Nothing uploaded: return the default result instead of failing on a None path
    if not input_file:
        return output_text, output_pdf

    # format language as expected by tesseract package, e.g. 'eng+fra'
    language = '+'.join(src_langs or [])

    # PDF file or image file?
    input_path = pathlib.Path(input_file)
    is_pdf = input_path.suffix.lower() == '.pdf'

    # output text?
    if output_type in ['text', 'text+pdf']:
        if is_pdf:
            texts = ocr.pdf_scanner.pdf_to_text(  # one text per page
                pdf_path=input_file,  # the path itself; a plain str has no `.name`
                language=language
            )
            output_text = '\n\n'.join(texts)
        else:
            output_text = ocr.pdf_scanner.image_to_text(
                image_path=input_file,
                language=language,
                psm=3
            )

    # output pdf?
    if output_type in ['pdf', 'text+pdf']:
        # Create a path for output PDF file; the UUID keeps concurrent
        # requests on identically named uploads from colliding.
        output_path = os.path.join(
            output_dir, f"{input_path.stem}_OCR_{uuid.uuid4()}.pdf")

        if is_pdf:
            output_pdf = ocr.pdf_scanner.pdf_to_searchable_pdf_ocrmypdf(
                pdf_path=input_file,
                output_path=output_path,
                language=language,
                deskew=True,
                optimize=True,
                clean=False,
                attempt_repair=True
            )
        else:
            output_pdf = ocr.pdf_scanner.image_to_searchable_pdf(
                image_path=input_file,
                output_path=output_path,
                language=language,
                psm=3
            )

    return output_text, output_pdf
#
# User interface
#
# Gradio UI definition: components are laid out in the order they are created
# inside the nested layout contexts below.
with gr.Blocks() as demo:
    # Upload file to process (left) and OCR results (right column)
    # NOTE(review): the upload label mentions PDF only, but process() also
    # handles non-PDF inputs through its image code path.
    with gr.Row():
        input_file = gr.File(label="Upload a PDF file of a scanned document")
        with gr.Column():
            output_text = gr.Textbox(label="OCR output")
            output_file = gr.File(label="Download OCR'ed PDF")
    # Input: language(s) used in document, output types
    with gr.Row():
        src_langs = gr.Dropdown(
            label='Language(s) of document',
            choices=lang_codes.tesseract_lang_codes.items(),
            multiselect=True,
            value=['eng', 'fra'],
            scale=4
        )
        output_type = gr.Dropdown(
            label='Output type',
            choices=['text', 'pdf', 'text+pdf'],
            multiselect=False,
            value='text',
            scale=1
        )
    # Buttons
    with gr.Row():
        ocr_btn = gr.Button(value="OCR", variant="primary")
        clear_btn = gr.Button("Clear", variant="secondary")
    # Examples
    # NOTE(review): each example row supplies two values (file, languages)
    # while three input components are listed -- confirm that leaving
    # output_type unset by the examples is intended.
    with gr.Accordion("Examples", open=False):
        examples = gr.Examples(
            [
                ['./pdfs/Non-text-searchable.pdf', ['eng','fra']],
                ['./sample_ID.jpeg', ['eng','fra']],
            ],
            inputs=[input_file, src_langs, output_type],
            outputs=[output_text, output_file],
            fn=process,
            cache_examples=False,
            label="Examples"
        )
    # Functions: wire the buttons to their callbacks
    ocr_btn.click(
        fn=process,
        inputs=[input_file, src_langs, output_type],
        outputs=[output_text, output_file]
    )
    # Clear resets the uploaded file and both outputs; the language and
    # output-type selections are left untouched.
    clear_btn.click(
        fn=lambda : (None, '', None),
        inputs=[],
        outputs=[input_file, output_text, output_file] # input_file, output_text, output_file
    )
# Launch the UI only when this module is executed as a script
if __name__ == '__main__':
    demo.launch()