File size: 590 Bytes
b16b8c7 990b2f4 b16b8c7 990b2f4 b16b8c7 990b2f4 b16b8c7 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 |
import pytesseract
from pdf2image import convert_from_path
def pdf_to_text(pdf_path, output_path, *, dpi=300):
    """OCR every page of a PDF and save the recognised text to a file.

    The PDF is first converted to a list of page images, each image is
    passed to Tesseract, and the per-page results are concatenated in
    page order with no extra separator added between them.

    Args:
        pdf_path: Path of the PDF to read.
        output_path: Path of the text file to create or overwrite; it is
            written as UTF-8.
        dpi: Resolution passed to ``convert_from_path`` when converting
            the pages to images. Defaults to 300, the value this function
            previously hard-coded.

    Returns:
        None. The text is written to ``output_path`` and a confirmation
        line is printed to stdout.
    """
    # Convert PDF to list of images
    pages = convert_from_path(pdf_path, dpi=dpi)

    # Extract text from all pages; join once rather than growing a string
    # with += on every iteration.
    text = "".join(pytesseract.image_to_string(page) for page in pages)

    # Write text to file
    with open(output_path, "w", encoding="utf-8") as file:
        file.write(text)

    print(f"OCR completed. Text saved to {output_path}")
# Usage: OCR 'input.pdf' from the current working directory and write the
# recognised text to 'output.txt'.
pdf_path = 'input.pdf'
output_path = 'output.txt'
pdf_to_text(pdf_path, output_path)