Spaces:

Didier
/

Optical_character_recognition

Running

File size: 5,618 Bytes

"""
File: module_ocr.py

Description: Gradio module to interact with the tesseract OCR code.

Author: Didier Guillevic
Date: 2024-11-23
"""

import gradio as gr
import os
import uuid
import shutil
import threading
import time
import pathlib

import ocr
import lang_codes


# Maximum age, in seconds, of a generated PDF before the cleanup loop deletes
# it (60 * 60 seconds = 1 hour)
AGE_LIMIT = 60 * 60

# Folder receiving the (temporary) OCR'ed PDF files whose path is returned to
# the user; created at import time when it does not exist yet
output_dir = "tmp_results"
os.makedirs(output_dir, exist_ok=True)

# Function to clean up old PDF files
def cleanup_old_files():
    """Periodically delete expired PDF files from ``output_dir``.

    Runs forever (it is meant to be the target of a background thread): each
    pass removes every ``*.pdf`` file in ``output_dir`` whose modification
    time lies more than ``AGE_LIMIT`` seconds in the past, then sleeps for an
    hour before the next pass.

    Filesystem errors are reported and skipped rather than propagated, so a
    single failure (e.g. a file that disappeared between the directory listing
    and its removal) cannot terminate the loop and stop all future cleanup.
    """
    while True:
        current_time = time.time()

        try:
            filenames = os.listdir(output_dir)
        except OSError as err:
            # Directory missing or unreadable: skip this pass, retry later
            print(f"Could not list directory {output_dir}: {err}")
            filenames = []

        for filename in filenames:
            if not filename.endswith(".pdf"):
                continue
            file_path = os.path.join(output_dir, filename)
            try:
                # Check if the file is older than the age limit
                file_age = current_time - os.path.getmtime(file_path)
                if file_age > AGE_LIMIT:
                    print(f"Removing old file: {file_path}")
                    os.remove(file_path)
            except OSError as err:
                # Keep going with the remaining files
                print(f"Could not clean up {file_path}: {err}")

        # Sleep for an hour before checking again
        time.sleep(3600)

# Run the periodic cleanup in the background; being a daemon thread, it does
# not prevent the interpreter from exiting
cleanup_thread = threading.Thread(
    daemon=True,
    target=cleanup_old_files,
)
cleanup_thread.start()

#
# Process one file
#
def process(
        input_file: str,
        src_langs: list[str], # list of ISO 639-3 language codes
        output_type: str
    ):
    """Process given file with OCR using given languages.

    Args:
        input_file: Path of the image or PDF file to process. A path-like
            object, or an object exposing the path through a ``name``
            attribute, is accepted as well. When empty or None, nothing is
            processed.
        src_langs: Language codes, joined with '+' before being handed to the
            OCR functions (e.g. ['eng', 'fra'] -> 'eng+fra').
        output_type: One of 'text', 'pdf' or 'text+pdf'. Any other value
            produces neither output.

    Returns:
        A tuple ``(output_text, output_pdf)``. ``output_text`` is the
        recognized text ('' when text output was not requested);
        ``output_pdf`` is whatever the ``ocr.pdf_scanner`` searchable-PDF
        function returned (presumably the path of the generated file -- to be
        confirmed against the ``ocr`` module), or None when PDF output was not
        requested.
    """
    # default result
    output_text = ''
    output_pdf = None

    # Nothing uploaded: return the default result instead of failing on a
    # None path
    if not input_file:
        return output_text, output_pdf

    # Normalize the input to a plain path string, used by every call below
    if isinstance(input_file, (str, os.PathLike)):
        input_path = os.fspath(input_file)
    else:
        input_path = input_file.name

    # format language as expected by tesseract package, e.g. 'eng+fra'
    language = '+'.join(src_langs)

    # PDF file or image file?
    is_pdf = pathlib.Path(input_path).suffix.lower() == '.pdf'

    # output text?
    if output_type in ('text', 'text+pdf'):
        if is_pdf:
            texts = ocr.pdf_scanner.pdf_to_text( # one text per page
                pdf_path=input_path,
                language=language
            )
            output_text = '\n\n'.join(texts)
        else:
            output_text = ocr.pdf_scanner.image_to_text(
                image_path=input_path,
                language=language,
                psm=3
            )

    # output pdf?
    if output_type in ('pdf', 'text+pdf'):
        # Create a path for output PDF file; the random UUID keeps results of
        # identically named uploads from colliding
        base_filename, _ = os.path.splitext(os.path.basename(input_path))
        output_path = os.path.join(
            output_dir, f"{base_filename}_OCR_{uuid.uuid4()}.pdf")

        if is_pdf:
            output_pdf = ocr.pdf_scanner.pdf_to_searchable_pdf(
                pdf_path=input_path,
                output_path=output_path,
                language=language,
                attempt_repair=True
            )
        else:
            output_pdf = ocr.pdf_scanner.image_to_searchable_pdf(
                image_path=input_path,
                output_path=output_path,
                language=language,
                psm=3
            )

    return output_text, output_pdf

#
# User interface
#
with gr.Blocks() as demo:

    def update_visibility(file):
        """Return an update making the component visible only if `file` is truthy."""
        return gr.update(visible=bool(file))

    # Left column: file to upload and resulting PDF; right column: OCR text
    with gr.Row():
        with gr.Column():
            input_file = gr.File(
                height=160,
                label="Upload an image or a PDF file of a scanned document",
            )
            output_file = gr.File(
                visible=False,  # hidden at start, toggled by update_visibility
                label="Download OCR'ed PDF",
            )
        with gr.Column():
            output_text = gr.Textbox(label="OCR output")

    # Inputs: language(s) used in the document and requested output type
    with gr.Row():
        src_langs = gr.Dropdown(
            choices=lang_codes.tesseract_lang_codes.items(),
            value=['eng', 'fra'],
            multiselect=True,
            label='Language(s) of document',
            scale=4,
        )
        output_type = gr.Dropdown(
            choices=['text', 'pdf', 'text+pdf'],
            value='text+pdf',
            multiselect=False,
            label='Output type',
            scale=1,
        )

    # Action buttons
    with gr.Row():
        ocr_btn = gr.Button("OCR", variant="primary")
        clear_btn = gr.Button(value="Clear", variant="secondary")

    # Sample documents (each row: file path, languages)
    with gr.Accordion("Examples", open=False):
        examples = gr.Examples(
            examples=[
                ['./Non-text-searchable.pdf', ['eng','fra']],
                ['./sample_ID.jpeg', ['eng','fra']],
            ],
            fn=process,
            inputs=[input_file, src_langs, output_type],
            outputs=[output_text, output_file],
            label="Examples",
            cache_examples=False,
        )

    # Short description of the underlying OCR engine
    with gr.Accordion("Documentation", open=False):
        gr.Markdown("""
            - Model: using the tesseract package for OCR 1.0 (traditional)
        """)

    # Event wiring: run the OCR, then show or hide the download component
    # depending on whether a PDF was produced
    ocr_btn.click(
        fn=process,
        inputs=[input_file, src_langs, output_type],
        outputs=[output_text, output_file],
    ).then(
        fn=update_visibility,
        inputs=output_file,
        outputs=output_file,
    )

    # Event wiring: reset the upload, the text and the PDF, then re-evaluate
    # the visibility of the (now empty) download component
    clear_btn.click(
        fn=lambda: (None, '', None),
        inputs=[],
        outputs=[input_file, output_text, output_file],
    ).then(
        fn=update_visibility,
        inputs=output_file,
        outputs=output_file,
    )

# Start the Gradio app only when this file is executed as a script
if __name__ == "__main__":
    demo.launch()