ocr / app.py
GAS17's picture
Update app.py
b16b8c7 verified
raw
history blame
590 Bytes
import pytesseract
from pdf2image import convert_from_path
def pdf_to_text(pdf_path, output_path, *, dpi=300):
    """OCR every page of a PDF and save the recognised text to a file.

    The PDF is rendered to page images with ``convert_from_path`` and each
    image is passed to ``pytesseract.image_to_string``. The per-page results
    are concatenated in page order with no separator added between them,
    then written to ``output_path`` as UTF-8. A confirmation message is
    printed once the file has been written.

    Args:
        pdf_path: Path of the PDF file to read.
        output_path: Path of the text file to write. The file is opened in
            ``"w"`` mode, so any existing content is replaced.
        dpi: Resolution passed to ``convert_from_path`` when rendering the
            pages. Defaults to 300, the value that used to be hard-coded.

    Returns:
        None. The result is delivered through the file at ``output_path``.
    """
    # Convert the PDF to a list of page images.
    pages = convert_from_path(pdf_path, dpi=dpi)

    # Build the full text with a single join rather than repeated ``+=``,
    # which can degrade to quadratic copying on long documents.
    text = "".join(pytesseract.image_to_string(page) for page in pages)

    # Write only after OCR has finished for every page, so a failure
    # part-way through never leaves a truncated output file behind.
    with open(output_path, "w", encoding="utf-8") as file:
        file.write(text)

    print(f"OCR completed. Text saved to {output_path}")
# Example run: OCR the PDF in the working directory into a text file.
pdf_path, output_path = 'input.pdf', 'output.txt'
pdf_to_text(pdf_path=pdf_path, output_path=output_path)