# Spaces: Running
import os | |
import tempfile | |
import gradio as gr | |
import ocrmypdf | |
def _is_temp_upload(path):
    """Return True when *path* is an existing file inside a temporary directory.

    A path counts as temporary when it resolves to a location below the
    system temp directory or below ``GRADIO_TEMP_DIR`` (when that environment
    variable is set). A real containment check is used instead of a substring
    match so that unrelated paths that merely contain "tmp" are never deleted,
    and so that temp directories without "tmp" in their name are recognised.
    """
    if not isinstance(path, str) or not os.path.isfile(path):
        return False

    candidate_roots = [tempfile.gettempdir()]
    gradio_root = os.environ.get("GRADIO_TEMP_DIR")
    if gradio_root:
        candidate_roots.append(gradio_root)

    resolved = os.path.realpath(path)
    for root in candidate_roots:
        root = os.path.realpath(root)
        try:
            if os.path.commonpath([root, resolved]) == root:
                return True
        except ValueError:
            # Raised when the two paths are on different drives (Windows).
            continue
    return False


def ocr_pdf(input_pdf_path, language):
    """Run OCR on a PDF and return the path of the new, searchable PDF.

    Args:
        input_pdf_path: Path of the PDF to process, or ``None`` when nothing
            was uploaded.
        language: OCR language code forwarded to ``ocrmypdf.ocr``
            (for example ``"ita"`` or ``"eng"``).

    Returns:
        The path of a newly created temporary file holding the OCR result,
        or ``None`` when ``input_pdf_path`` is ``None``.

    Raises:
        gr.Error: If anything goes wrong while running OCR. The message is
            ``"Error during OCR: <original error>"``. An error is raised
            rather than returned because the result is wired to a file
            output, where a returned string would be treated as a file path.
    """
    if input_pdf_path is None:
        return None

    output_path = None
    try:
        input_base, input_ext = os.path.splitext(os.path.basename(input_pdf_path))
        # mkstemp lets us close the descriptor before OCRmyPDF writes to the
        # path, so the file is never open twice at the same time.
        fd, output_path = tempfile.mkstemp(
            suffix=input_ext,
            prefix=f"{input_base}_ocr_",
        )
        os.close(fd)

        ocrmypdf.ocr(
            input_pdf_path,
            output_path,
            deskew=True,
            clean=True,  # page cleaning relies on the external `unpaper` tool
            language=language,
            force_ocr=True,
        )
        return output_path
    except Exception as e:
        # Do not leave a half-written or empty output file behind on failure.
        if output_path is not None:
            try:
                os.remove(output_path)
            except OSError:
                pass
        raise gr.Error(f"Error during OCR: {e}") from e
    finally:
        # The uploaded copy is no longer needed once OCR has finished or failed.
        if _is_temp_upload(input_pdf_path):
            try:
                os.remove(input_pdf_path)
            except OSError as e:
                print(f"Error deleting temporary input file: {e}")
if __name__ == "__main__":
    # Input widgets, listed in the positional order of
    # ocr_pdf(input_pdf_path, language).
    pdf_upload = gr.File(label="Upload PDF to OCR")
    language_picker = gr.Dropdown(
        choices=["ita", "eng"],
        value="ita",
        label="OCR Language",
    )

    # Output widget that receives the path returned by ocr_pdf.
    ocr_result = gr.File(label="PDF with OCR")

    app = gr.Interface(
        fn=ocr_pdf,
        inputs=[pdf_upload, language_picker],
        outputs=ocr_result,
        title="OCR my PDF",
        description="Add a text layer to your PDF file",
    )
    app.launch()