"""Gradio app that extracts text from an uploaded PDF using TrOCR.

Each PDF page is rendered to an image, passed through the
``microsoft/trocr-base-handwritten`` model, and falls back to Tesseract when
TrOCR produces only whitespace.
"""

import gradio as gr
import pytesseract
from pdf2image import convert_from_path
from transformers import TrOCRProcessor, VisionEncoderDecoderModel

# Single source of truth for the checkpoint so processor and model cannot drift.
_MODEL_NAME = "microsoft/trocr-base-handwritten"

# Load TrOCR processor and model from Hugging Face once, at import time,
# so every request reuses the same instances.
processor = TrOCRProcessor.from_pretrained(_MODEL_NAME)
model = VisionEncoderDecoderModel.from_pretrained(_MODEL_NAME)


def extract_text_from_pdf(pdf_path: str) -> str:
    """Return the OCR text of every page in the PDF at *pdf_path*.

    Pages are rendered with ``convert_from_path`` and recognised one at a
    time with TrOCR. If TrOCR yields an empty or whitespace-only string for a
    page, the page is re-read with ``pytesseract.image_to_string`` instead.

    Args:
        pdf_path: Filesystem path of the PDF to read.

    Returns:
        The per-page texts joined with newlines (an empty string when the
        PDF renders to no pages).
    """
    # NOTE(review): the TrOCR checkpoints are documented for single text-line
    # images; feeding a whole page may give poor results — confirm whether a
    # line-segmentation step is needed upstream.
    page_texts = []
    for page in convert_from_path(pdf_path):
        # The TrOCR usage examples feed RGB images; convert defensively in
        # case a page was rendered in another mode (a no-op for RGB pages).
        page = page.convert("RGB")

        # Convert image to text using TrOCR.
        pixel_values = processor(images=page, return_tensors="pt").pixel_values
        generated_ids = model.generate(pixel_values)
        text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]

        # Fall back to Tesseract if TrOCR produced nothing usable.
        if not text.strip():
            text = pytesseract.image_to_string(page)

        page_texts.append(text)

    return "\n".join(page_texts)


def ocr_pipeline(pdf_file) -> str:
    """Gradio callback: run OCR on the uploaded PDF and return its text.

    Args:
        pdf_file: The value supplied by the ``gr.File`` input. Either a
            file-like object exposing the path as ``.name`` or a plain path
            string is accepted; ``None`` means nothing was uploaded.

    Returns:
        The extracted text of the whole document.

    Raises:
        gr.Error: If no file was uploaded.
    """
    if pdf_file is None:
        # Previously this crashed with AttributeError on ``None.name``.
        raise gr.Error("Please upload a PDF file.")

    # Use ``.name`` when present, otherwise treat the value itself as the path.
    pdf_path = getattr(pdf_file, "name", pdf_file)
    return extract_text_from_pdf(pdf_path)


# Gradio interface wiring the upload widget to the OCR pipeline.
iface = gr.Interface(
    fn=ocr_pipeline,
    inputs=gr.File(label="Upload PDF"),
    outputs="text",
    title="PDF Text Extraction using TrOCR",
)

# Run the Gradio app only when executed as a script.
if __name__ == "__main__":
    iface.launch()