import os

import torch
import gradio as gr
from PIL import Image
from transformers import PaliGemmaForConditionalGeneration, PaliGemmaProcessor
from huggingface_hub import login

# Authenticate so the gated PaliGemma weights can be downloaded
# (requires the HF_TOKEN environment variable to be set).
login(token=os.environ.get("HF_TOKEN"))

# Load model and processor
model_id = "google/paligemma-3b-mix-224"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = PaliGemmaForConditionalGeneration.from_pretrained(model_id, torch_dtype=torch.bfloat16)
model.to(device)
processor = PaliGemmaProcessor.from_pretrained(model_id)


# Define the query function
def query_image(image: Image.Image, prompt: str) -> str:
    """Answer a free-form question about an uploaded image."""
    if image is None or not (prompt and prompt.strip()):
        return "Please upload an image and enter a question."

    inputs = processor(
        text=prompt,
        images=image,
        padding="longest",
        do_convert_rgb=True,
        return_tensors="pt",
    ).to(device)
    # BatchFeature.to(dtype=...) casts only the floating-point tensors
    # (pixel_values) to the model dtype; input_ids stay integer.
    inputs = inputs.to(dtype=model.dtype)

    input_len = inputs["input_ids"].shape[-1]
    with torch.no_grad():
        output = model.generate(**inputs, max_length=496)

    # Decode only the newly generated tokens so the prompt is not echoed back.
    return processor.decode(output[0][input_len:], skip_special_tokens=True)


# Gradio interface
gr.Interface(
    fn=query_image,
    inputs=[
        gr.Image(type="pil", label="Upload an Image"),
        gr.Textbox(
            label="Ask a question about the image",
            placeholder="e.g. What is shown in the image?",
        ),
    ],
    outputs=gr.Textbox(label="Answer"),
    title="🔍 Visual Question Answering",
    description="Upload an image and ask questions about it.",
    allow_flagging="never",
).launch()
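
# ---------------------------------------------------------------------------
# Optional: the running app can also be queried from a separate process with
# gradio_client. This is a sketch, not part of the script itself; it assumes
# the app is serving at the default local URL, that a recent gradio_client
# (with handle_file) is installed, and that "example.jpg" is a hypothetical
# local test image.
#
#   from gradio_client import Client, handle_file
#
#   client = Client("http://127.0.0.1:7860")
#   answer = client.predict(
#       handle_file("example.jpg"),        # maps to the gr.Image input
#       "What is shown in the image?",     # maps to the gr.Textbox input
#       api_name="/predict",               # default endpoint for gr.Interface
#   )
#   print(answer)
# ---------------------------------------------------------------------------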