import gradio as gr
import torch
from PIL import Image, ImageChops
from torchvision import transforms
from transformers import (
    CLIPProcessor,
    CLIPModel,
    CLIPTokenizer,
    CLIPTextModelWithProjection,
    CLIPVisionModelWithProjection,
    CLIPFeatureExtractor,
)
import math
from typing import List
import numpy as np
from diffusers import DiffusionPipeline, UnCLIPPipeline
# from diffusers.utils.torch_utils import randn_tensor

# Original huggingface prior transformer without time conditioning.
from src.priors.prior_transformer import PriorTransformer
from src.pipelines.pipeline_kandinsky_prior import KandinskyPriorPipeline

__DEVICE__ = "cpu"
if torch.cuda.is_available():
    __DEVICE__ = "cuda"


class Ours:
    def __init__(self, device):
        # Frozen CLIP ViT-bigG-14 text encoder and tokenizer used by the ECLIPSE prior.
        text_encoder = (
            CLIPTextModelWithProjection.from_pretrained(
                "laion/CLIP-ViT-bigG-14-laion2B-39B-b160k",
                projection_dim=1280,
                torch_dtype=torch.float32,
            )
            .eval()
            .requires_grad_(False)
        )
        tokenizer = CLIPTokenizer.from_pretrained(
            "laion/CLIP-ViT-bigG-14-laion2B-39B-b160k"
        )

        # ECLIPSE prior: maps text embeddings to CLIP image embeddings.
        prior = PriorTransformer.from_pretrained(
            "ECLIPSE-Community/ECLIPSE_KandinskyV22_Prior",
            torch_dtype=torch.float32,
        )

        self.pipe_prior = KandinskyPriorPipeline.from_pretrained(
            "kandinsky-community/kandinsky-2-2-prior",
            prior=prior,
            text_encoder=text_encoder,
            tokenizer=tokenizer,
            torch_dtype=torch.float32,
        ).to(device)

        # Kandinsky v2.2 decoder: renders images from the predicted image embeddings.
        self.pipe = DiffusionPipeline.from_pretrained(
            "kandinsky-community/kandinsky-2-2-decoder", torch_dtype=torch.float32
        ).to(device)

    def inference(self, text, negative_text, steps, guidance_scale):
        gen_images = []
        for _ in range(1):  # generate a single sample
            image_emb, negative_image_emb = self.pipe_prior(
                text, negative_prompt=negative_text
            ).to_tuple()
            image = self.pipe(
                image_embeds=image_emb,
                negative_image_embeds=negative_image_emb,
                num_inference_steps=steps,
                guidance_scale=guidance_scale,
            ).images
            gen_images.append(image[0])
        return gen_images


selected_model = Ours(device=__DEVICE__)


def get_images(text, negative_text, steps, guidance_scale):
    images = selected_model.inference(text, negative_text, steps, guidance_scale)
    # Return only the first generated image for the Gradio output.
    return images[0]


with gr.Blocks() as demo:
    gr.Markdown(
        """