File size: 1,506 Bytes
30d6a12
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
import os
import tempfile
import gradio as gr
import ocrmypdf


def _is_in_temp_dir(path):
    """Return True when *path* resolves to a location inside the system temp dir."""
    temp_root = os.path.realpath(tempfile.gettempdir())
    candidate = os.path.realpath(path)
    try:
        return os.path.commonpath([temp_root, candidate]) == temp_root
    except ValueError:
        # commonpath() refuses to compare paths on different drives.
        return False


def ocr_pdf(input_pdf_path, language):
    """Run OCR on a PDF and return the path of the resulting file.

    Args:
        input_pdf_path: Path of the PDF to process, or ``None`` when no file
            was supplied.
        language: Language code forwarded to ``ocrmypdf.ocr``.

    Returns:
        The path of a newly created temporary file containing the OCR
        output, or ``None`` when ``input_pdf_path`` is ``None``.

    Raises:
        gr.Error: If anything goes wrong while preparing or running OCR. The
            message keeps the ``"Error during OCR: ..."`` wording, but it is
            raised instead of returned so it is never mistaken for a file
            path by the caller.
    """
    if input_pdf_path is None:
        return None

    output_path = None
    try:
        input_base, input_ext = os.path.splitext(os.path.basename(input_pdf_path))

        # Only reserve a unique output name here; the handle is closed before
        # OCR starts so the library can freely (re)write the file.
        with tempfile.NamedTemporaryFile(
            suffix=input_ext,
            prefix=f"{input_base}_ocr_",
            delete=False,
        ) as tmp_output:
            output_path = tmp_output.name

        ocrmypdf.ocr(
            input_pdf_path,
            output_path,
            deskew=True,
            clean=True,
            language=language,
            force_ocr=True,
        )
        return output_path

    except Exception as e:
        # The output file was created with delete=False, so it must be
        # removed by hand when OCR did not produce a usable result.
        if output_path is not None:
            try:
                os.remove(output_path)
            except OSError as cleanup_error:
                print(f"Error deleting temporary output file: {cleanup_error}")
        raise gr.Error(f"Error during OCR: {e}") from e
    finally:
        # Discard the uploaded copy once it has been processed, but only when
        # it really lives in the temp directory: a substring match on "tmp"
        # could delete unrelated user files.
        if (
            isinstance(input_pdf_path, str)
            and os.path.isfile(input_pdf_path)
            and _is_in_temp_dir(input_pdf_path)
        ):
            try:
                os.remove(input_pdf_path)
            except OSError as e:
                print(f"Error deleting temporary input file: {e}")


if __name__ == "__main__":
    # Input widgets: the PDF to process and the OCR language (defaults to "ita").
    pdf_upload = gr.File(label="Upload PDF to OCR")
    language_picker = gr.Dropdown(
        choices=["ita", "eng"],
        value="ita",
        label="OCR Language",
    )

    # Output widget that receives whatever ocr_pdf returns.
    ocr_result = gr.File(label="PDF with OCR")

    # Wire the widgets to ocr_pdf and start the web UI.
    app = gr.Interface(
        fn=ocr_pdf,
        inputs=[pdf_upload, language_picker],
        outputs=ocr_result,
        title="OCR my PDF",
        description="Add a text layer to your PDF file",
    )
    app.launch()