"""Gradio demo: document VQA with a fine-tuned Florence-2 model.

Upload an image and ask a question; the model extracts the answer text
from the document image.
"""

import gradio as gr
import torch
from PIL import Image
from transformers import AutoConfig, AutoModelForCausalLM, AutoProcessor


# NOTE(review): this class is never referenced anywhere in the script and its
# to_dict() only delegates to the parent — kept for backward compatibility in
# case external code imports it, but it is a candidate for deletion.
class Florence2Config(AutoConfig):
    def to_dict(self):
        return super().to_dict()


MODEL_ID = "mynkchaudhry/Florence-2-FT-DocVQA"

# Load model and processor once at startup.  force_download=True was removed:
# it re-downloaded the full weights on every launch; the default behavior
# uses the local cache and only fetches when the files are missing or stale.
model = AutoModelForCausalLM.from_pretrained(MODEL_ID, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)

# Pick the device once and move the model once, instead of calling
# model.to(device) inside every request (the original did the move per call).
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)


def generate_response(image, question):
    """Answer *question* about the uploaded document *image*.

    Args:
        image: PIL.Image.Image (or None if the user submitted nothing).
        question: free-text question about the document.

    Returns:
        The decoded model answer, or a human-readable error string —
        Gradio renders whatever string we return, so failures are
        surfaced in the UI rather than raised.
    """
    if image is None:
        return "Error: please upload an image first."
    try:
        # The processor expects RGB; uploads may be RGBA/L/P etc.
        if image.mode != "RGB":
            image = image.convert("RGB")

        inputs = processor(text=question, images=image, return_tensors="pt")
        inputs = {key: value.to(device) for key, value in inputs.items()}

        # Inference only — no_grad avoids building the autograd graph.
        with torch.no_grad():
            generated_ids = model.generate(
                input_ids=inputs["input_ids"],
                pixel_values=inputs["pixel_values"],
                max_length=1024,
                num_beams=3,
                early_stopping=True,
            )

        return processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
    except Exception as e:
        # Broad catch is deliberate at this UI boundary: any failure is
        # shown to the user instead of crashing the worker.
        return f"Error processing image: {e}"


# Example images for demonstration (update paths as needed).
examples = [
    ["demo.png", "what is the address in the page?"],
    ["demo2.jpg", "what is the date in the page?"],
    ["demo.png", "what is the name in the page?"],
]

# Gradio interface
iface = gr.Interface(
    fn=generate_response,
    inputs=[gr.Image(type="pil"), gr.Textbox(label="Question")],
    outputs=gr.Textbox(label="Response"),
    examples=examples,
    title="Image to Text Extractor",
    description=(
        "Upload an image and provide a question. This tool will extract "
        "the relevant information from the image based on your question."
    ),
)

# Guarded entry point so importing this module does not start a server.
if __name__ == "__main__":
    iface.launch()