"""Gradio demo that runs the GOT-OCR 2.0 model on an uploaded image."""

import os

import gradio as gr
import torch
from transformers import AutoModelForImageTextToText, AutoProcessor

# Hugging Face Hub id shared by the model weights and the processor.
MODEL_ID = "stepfun-ai/GOT-OCR-2.0-hf"

# Set before the model is built so that anything consulting it at load time
# can see it (previously it was assigned only after the model was loaded).
# NOTE(review): confirm the installed torch/transformers actually read this
# variable; the documented way to avoid SDPA is passing
# attn_implementation="eager" to from_pretrained.
os.environ["PYTORCH_SDP_ATTENTION"] = "0"  # Disable SDPA

device = "cuda" if torch.cuda.is_available() else "cpu"
model = AutoModelForImageTextToText.from_pretrained(MODEL_ID, device_map=device)
processor = AutoProcessor.from_pretrained(MODEL_ID)


def extract_text_from_image(image):
    """Run OCR on *image* and return the text produced by the model.

    Args:
        image: The image handed over by the ``gr.Image(type="pil")`` input,
            or ``None`` when the form is submitted without an image.

    Returns:
        The decoded text of the newly generated tokens (the prompt tokens
        are dropped), or an empty string when no image was supplied.
    """
    # Gradio passes None for an empty image input; there is nothing to read.
    if image is None:
        return ""

    inputs = processor(image, return_tensors="pt").to(device)
    generate_ids = model.generate(
        **inputs,
        do_sample=False,
        tokenizer=processor.tokenizer,
        stop_strings="<|im_end|>",
        max_new_tokens=4096,
    )
    # The generated sequence starts with the prompt; decode only what follows.
    prompt_length = inputs["input_ids"].shape[1]
    return processor.decode(
        generate_ids[0, prompt_length:],
        skip_special_tokens=True,
    )


interface = gr.Interface(
    fn=extract_text_from_image,
    inputs=gr.Image(type="pil"),
    outputs=gr.Textbox(),
    title="OCR on Receipts",
    description="Upload an image to extract text using the GOT-OCR 2.0 model.",
    examples=[
        ["images/250406_01.jpg"],
        ["images/250409_01.jpg"],
    ],
)

# Only start the web server when executed as a script, so importing this
# module (e.g. to reuse extract_text_from_image) does not launch the UI.
if __name__ == "__main__":
    interface.launch(share=False)