import torch
import spaces  # Import early so the ZeroGPU runtime is set up before CUDA is touched
import gradio as gr
from transformers import CLIPProcessor, CLIPModel

# Load the CLIP model and processor on the CPU at startup; ZeroGPU attaches a GPU
# only while a @spaces.GPU-decorated function is running.
model_name = "openai/clip-vit-base-patch32"
model = CLIPModel.from_pretrained(model_name)
processor = CLIPProcessor.from_pretrained(model_name)

@spaces.GPU  # Request a GPU from ZeroGPU for the duration of this call
def clip_similarity(image, text):
    # Move the preloaded model onto the GPU inside the ZeroGPU context
    device = torch.device("cuda")
    model.to(device)
    inputs = processor(text=[text], images=image, return_tensors="pt", padding=True)
    inputs = {k: v.to(device) for k, v in inputs.items()}
    with torch.no_grad():
        outputs = model(**inputs)

    # logits_per_image has shape (1, 1) for a single image/text pair
    similarity_score = outputs.logits_per_image.detach().cpu().numpy()[0][0]
    return float(similarity_score)

# Set up the Gradio interface
iface = gr.Interface(
    fn=clip_similarity,
    inputs=[
        gr.Image(type="pil", label="Upload Image"),
        gr.Text(label="Input Text")
    ],
    outputs=gr.Number(label="Similarity Score"),
    title="CLIP Similarity Demo with ZeroGPU"
)

if __name__ == "__main__":
    iface.launch()