# Spaces: Running
import gradio as gr | |
from transformers import TrOCRProcessor, VisionEncoderDecoderModel | |
from pdf2image import convert_from_path | |
import pytesseract | |
# Load the TrOCR processor and model from Hugging Face.
# Both are created from the same checkpoint name, so it is spelled out once.
TROCR_CHECKPOINT = "microsoft/trocr-base-handwritten"
processor = TrOCRProcessor.from_pretrained(TROCR_CHECKPOINT)
model = VisionEncoderDecoderModel.from_pretrained(TROCR_CHECKPOINT)
# Extract text from every page of a PDF
def extract_text_from_pdf(pdf_path):
    """Return the OCR text of the PDF at ``pdf_path``.

    The PDF is turned into a sequence of page images with
    ``convert_from_path``. Each image is run through the module-level TrOCR
    ``processor``/``model`` pair; when the decoded result is empty or only
    whitespace, the page is re-read with ``pytesseract.image_to_string``.

    Args:
        pdf_path: Path of the PDF file, passed to ``convert_from_path``.

    Returns:
        One text entry per page image, joined with newline characters.
    """

    def recognize(page):
        # TrOCR pass: preprocess the image, generate token ids, decode them.
        encoded = processor(images=page, return_tensors="pt")
        token_ids = model.generate(encoded.pixel_values)
        decoded = processor.batch_decode(token_ids, skip_special_tokens=True)[0]
        if decoded.strip():
            return decoded
        # TrOCR produced nothing usable, so fall back to Tesseract.
        return pytesseract.image_to_string(page)

    pages = convert_from_path(pdf_path)
    return "\n".join(recognize(page) for page in pages)
# Gradio callback
def ocr_pipeline(pdf_file):
    """Gradio handler: run OCR on the uploaded PDF and return its text.

    Args:
        pdf_file: Value delivered by the ``gr.File`` input. It may be
            ``None`` when the form is submitted without a file, an object
            exposing the upload's path as ``.name``, or a plain path string.

    Returns:
        The text produced by ``extract_text_from_pdf``, or an empty string
        when no file was provided.
    """
    # Submitting without an upload must not crash with AttributeError.
    if pdf_file is None:
        return ""
    # Prefer the ``.name`` path; a bare path string has no such attribute,
    # so fall back to the value itself.
    pdf_path = getattr(pdf_file, "name", pdf_file)
    return extract_text_from_pdf(pdf_path)
# Gradio UI: a single PDF upload in, the extracted text out.
_pdf_upload = gr.File(label="Upload PDF")
iface = gr.Interface(
    fn=ocr_pipeline,
    inputs=_pdf_upload,
    outputs="text",
    title="PDF Text Extraction using TrOCR",
)

# Start the app only when this file is executed as a script.
if __name__ == "__main__":
    iface.launch()