"""Gradio demo: OCR an uploaded image with TrOCR, then continue the text with an LLM."""

import gradio as gr
from PIL import Image
from transformers import TrOCRProcessor, VisionEncoderDecoderModel, pipeline

# Checkpoint shared by the OCR processor and the OCR model so the two cannot drift apart.
OCR_MODEL_NAME = "microsoft/trocr-base-stage1"
LLM_MODEL_NAME = "distilgpt2"
# Value forwarded to the text-generation pipeline as ``max_length``.
MAX_LLM_LENGTH = 100

# Load OCR model
processor = TrOCRProcessor.from_pretrained(OCR_MODEL_NAME)
model = VisionEncoderDecoderModel.from_pretrained(OCR_MODEL_NAME)

# Load LLM
llm = pipeline("text-generation", model=LLM_MODEL_NAME)


def process_image(image) -> str:
    """Run OCR on *image* and generate an LLM continuation of the recognised text.

    Args:
        image: Image data accepted by ``PIL.Image.fromarray`` (the interface
            below is configured with ``gr.Image(type="numpy")``), or ``None``
            when nothing was supplied.

    Returns:
        A string containing the OCR text followed by the LLM output, or a
        short message when no image was provided.
    """
    # Guard: Image.fromarray would fail on None with an opaque error.
    if image is None:
        return "No image provided."

    # Normalise to 3-channel RGB so grayscale / RGBA inputs are handled uniformly.
    pil_image = Image.fromarray(image).convert("RGB")

    # OCR: image -> pixel tensor -> generated token ids -> decoded text.
    pixel_values = processor(images=pil_image, return_tensors="pt").pixel_values
    generated_ids = model.generate(pixel_values)
    text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]

    # Use the OCR text as the prompt for the text-generation pipeline.
    generations = llm(text, max_length=MAX_LLM_LENGTH, do_sample=True)
    llm_output = generations[0]["generated_text"]

    return f"OCR Text:\n{text}\n\nLLM Response:\n{llm_output}"


# Bound to a module-level name so the interface can be imported without
# starting a server; the server is launched only when run as a script.
demo = gr.Interface(
    fn=process_image,
    inputs=gr.Image(type="numpy"),
    outputs="text",
    title="OCR + LLM Text Generator",
)

if __name__ == "__main__":
    demo.launch()