import pytesseract | |
from pdf2image import convert_from_path | |
def pdf_to_text(pdf_path, output_path, *, dpi=300):
    """Run OCR over every page of a PDF and save the recognised text.

    Args:
        pdf_path: Path of the PDF file to read.
        output_path: Path of the text file to write. It is opened in
            ``"w"`` mode with UTF-8 encoding, so existing content is
            replaced.
        dpi: Resolution handed to ``convert_from_path`` when the pages
            are rendered to images. Defaults to 300, the value that was
            previously hard-coded.

    Returns:
        None. The recognised text is written to ``output_path`` and a
        confirmation line is printed.
    """
    # Convert the PDF into one image per page.
    pages = convert_from_path(pdf_path, dpi=dpi)

    # OCR each page in order and join once instead of growing a string
    # with repeated ``+=``; the resulting text is the same.
    text = "".join(pytesseract.image_to_string(page) for page in pages)

    # Write the combined text to the output file.
    with open(output_path, "w", encoding="utf-8") as file:
        file.write(text)

    print(f"OCR completed. Text saved to {output_path}")
# Usage: guarded so the OCR run happens only when this file is executed
# as a script, not as a side effect of importing it.
if __name__ == "__main__":
    pdf_path = 'input.pdf'
    output_path = 'output.txt'
    pdf_to_text(pdf_path, output_path)