Spaces:

GAS17
/

ocr

Running

ocr

File size: 590 Bytes

b16b8c7
 
990b2f4
b16b8c7
 
 
990b2f4
b16b8c7
 
 
 
 
 
 
 
 
 
990b2f4
b16b8c7

import pytesseract
from pdf2image import convert_from_path

def pdf_to_text(pdf_path, output_path, *, dpi=300):
    """Run OCR over every page of a PDF and save the combined text to a file.

    Args:
        pdf_path: Path of the PDF file to read.
        output_path: Path of the text file to write. It is opened in "w"
            mode with UTF-8 encoding, so an existing file is overwritten.
        dpi: Resolution handed to ``convert_from_path`` when the pages are
            rendered to images. Defaults to 300, the value that used to be
            hard-coded, so existing two-argument calls behave as before.

    Returns:
        None. The recognised text is written to ``output_path`` and a
        confirmation line is printed to stdout.
    """
    # Render the PDF pages to images at the requested resolution.
    pages = convert_from_path(pdf_path, dpi=dpi)

    # OCR the pages in document order and concatenate the results without
    # inserting any separator. A single join replaces repeated `+=`, which
    # re-copies the accumulated text on every page.
    text = "".join(pytesseract.image_to_string(page) for page in pages)

    # The output file is only opened once all OCR has succeeded, so a failure
    # above never truncates an existing file.
    with open(output_path, "w", encoding="utf-8") as file:
        file.write(text)

    print(f"OCR completed. Text saved to {output_path}")

# Example run: OCR 'input.pdf' from the current working directory and store
# the recognised text in 'output.txt'. This executes whenever the module is
# loaded, because it sits at module top level.
pdf_path, output_path = 'input.pdf', 'output.txt'
pdf_to_text(
    pdf_path,
    output_path,
)