Spaces:

GAS17
/

ocr

Running

ocr / app.py

Update app.py

b16b8c7 verified 5 months ago

590 Bytes

	import pytesseract
	from pdf2image import convert_from_path

	def pdf_to_text(pdf_path, output_path, dpi=300):
	    """Run OCR over every page of a PDF and save the recognized text.

	    Args:
	        pdf_path: Path of the PDF file to read.
	        output_path: Path of the text file to write. It is opened in
	            write mode with UTF-8 encoding, so existing content is
	            replaced.
	        dpi: Resolution handed to ``convert_from_path`` when turning
	            pages into images. Defaults to 300, the value that was
	            previously hard-coded.

	    Returns:
	        None. The text is written to ``output_path`` and a completion
	        message is printed.
	    """
	    # Convert the PDF into one image per page.
	    pages = convert_from_path(pdf_path, dpi=dpi)

	    # OCR the pages in order and join once, instead of growing a string
	    # with ``+=`` on every iteration.
	    text = "".join(pytesseract.image_to_string(page) for page in pages)

	    with open(output_path, "w", encoding="utf-8") as file:
	        file.write(text)

	    print(f"OCR completed. Text saved to {output_path}")

	# Example run: OCR a fixed input PDF into a fixed output text file.
	pdf_path, output_path = 'input.pdf', 'output.txt'
	pdf_to_text(pdf_path=pdf_path, output_path=output_path)