Spaces:

giovo17
/

ocr-my-pdf

Running

ocr-my-pdf / app.py

Giovanni Spadaro

init

30d6a12 29 days ago

1.51 kB

	import os
	import tempfile
	import gradio as gr
	import ocrmypdf


	def ocr_pdf(input_pdf_path, language):
	    """Add an OCR text layer to a PDF and return the path of the result.

	    Args:
	        input_pdf_path: Filesystem path of the PDF to process, or ``None``
	            when no file was supplied.
	        language: Language code forwarded to ``ocrmypdf.ocr`` as its
	            ``language`` argument.

	    Returns:
	        The path of a newly created temporary file holding the OCR'd PDF,
	        or ``None`` when ``input_pdf_path`` is ``None``.

	    Raises:
	        gr.Error: If anything goes wrong while preparing or running OCR.
	            The output of this callback is a file component, so an error
	            message must be raised rather than returned: a returned string
	            would be interpreted as a (non-existent) file path.
	    """
	    if input_pdf_path is None:
	        return None

	    output_path = None
	    try:
	        input_base, input_ext = os.path.splitext(os.path.basename(input_pdf_path))

	        # Only reserve a unique output name here; the handle is closed before
	        # OCRmyPDF writes to the path so the file is never opened twice.
	        with tempfile.NamedTemporaryFile(
	            suffix=input_ext,
	            prefix=f"{input_base}_ocr_",
	            delete=False,
	        ) as tmp_output:
	            output_path = tmp_output.name

	        ocrmypdf.ocr(
	            input_pdf_path,
	            output_path,
	            deskew=True,
	            clean=True,
	            language=language,
	            force_ocr=True,
	        )
	        return output_path

	    except Exception as e:
	        # The output file was created with delete=False, so nothing else will
	        # remove the empty/partial result of a failed run.
	        if output_path is not None:
	            try:
	                os.remove(output_path)
	            except OSError as cleanup_error:
	                print(f"Error deleting temporary output file: {cleanup_error}")
	        raise gr.Error(f"Error during OCR: {e}") from e
	    finally:
	        # Best-effort removal of the uploaded copy. The "tmp" substring test is
	        # a heuristic for "this is a temporary upload".
	        # NOTE(review): re-submitting the same upload after this cleanup may
	        # find the input missing - confirm against the Gradio version in use.
	        if isinstance(input_pdf_path, str) and os.path.exists(input_pdf_path) and "tmp" in input_pdf_path:
	            try:
	                os.remove(input_pdf_path)
	            except OSError as e:
	                print(f"Error deleting temporary input file: {e}")


	if __name__ == "__main__":
	    # Build the UI pieces one at a time, in the same order they are used:
	    # the PDF upload, the language selector, and the downloadable result.
	    pdf_upload = gr.File(label="Upload PDF to OCR")
	    language_choice = gr.Dropdown(
	        choices=["ita", "eng"],
	        value="ita",
	        label="OCR Language",
	    )
	    ocr_result = gr.File(label="PDF with OCR")

	    # Wire the components to the OCR callback and start the web app.
	    app = gr.Interface(
	        fn=ocr_pdf,
	        inputs=[pdf_upload, language_choice],
	        outputs=ocr_result,
	        title="OCR my PDF",
	        description="Add a text layer to your PDF file",
	    )
	    app.launch()