# Source: Hugging Face Space "dezzman/diffusion_models" (status: Running).
# File size: 8,048 bytes.

import gradio as gr
import numpy as np
import random
import os

# import spaces #[uncomment to use ZeroGPU]
from diffusers import DiffusionPipeline, StableDiffusionPipeline
from peft import PeftModel, LoraConfig
import torch
from typing import Optional


def get_lora_sd_pipeline(
    ckpt_dir='./lora_logos',
    base_model_name_or_path=None,
    dtype=torch.float16,
    adapter_name="default"
):
    """Build a Stable Diffusion pipeline with LoRA adapters attached via PEFT.

    The adapter for the UNet is read from ``<ckpt_dir>/unet``. If
    ``<ckpt_dir>/text_encoder`` exists, an adapter for the text encoder is
    attached as well.

    Args:
        ckpt_dir: Directory holding the ``unet`` (and optionally
            ``text_encoder``) adapter sub-directories.
        base_model_name_or_path: Base model passed to
            ``StableDiffusionPipeline.from_pretrained``. When ``None`` it is
            taken from the text-encoder adapter config, if that directory
            exists.
        dtype: Torch dtype used to load the base pipeline. For the reduced
            precision dtypes (float16 / bfloat16) the wrapped UNet and text
            encoder are cast to this dtype after the adapters are attached.
        adapter_name: Name under which the adapters are registered in PEFT.

    Returns:
        The pipeline with ``pipe.unet`` (and possibly ``pipe.text_encoder``)
        replaced by their PEFT-wrapped versions.

    Raises:
        ValueError: If no base model was given and none could be read from
            the text-encoder adapter config.
    """
    unet_sub_dir = os.path.join(ckpt_dir, "unet")
    text_encoder_sub_dir = os.path.join(ckpt_dir, "text_encoder")
    has_text_encoder_adapter = os.path.exists(text_encoder_sub_dir)

    if has_text_encoder_adapter and base_model_name_or_path is None:
        config = LoraConfig.from_pretrained(text_encoder_sub_dir)
        base_model_name_or_path = config.base_model_name_or_path

    if base_model_name_or_path is None:
        raise ValueError("Please specify the base model name or path")

    pipe = StableDiffusionPipeline.from_pretrained(base_model_name_or_path, torch_dtype=dtype)
    pipe.unet = PeftModel.from_pretrained(pipe.unet, unet_sub_dir, adapter_name=adapter_name)
    # Start-up diagnostics: where the UNet adapter was read from and in which dtype.
    print(os.path.exists(unet_sub_dir))
    print(unet_sub_dir)
    print(dtype)

    if has_text_encoder_adapter:
        pipe.text_encoder = PeftModel.from_pretrained(
            pipe.text_encoder, text_encoder_sub_dir, adapter_name=adapter_name
        )

    if dtype in (torch.float16, torch.bfloat16):
        # Cast to the dtype that was actually requested. `.half()` would
        # force float16 even when bfloat16 was asked for, leaving the wrapped
        # modules in a different dtype than the rest of the pipeline.
        pipe.unet.to(dtype)
        pipe.text_encoder.to(dtype)
    return pipe

def split_prompt(prompt, tokenizer, max_length=77):
    """Tokenize ``prompt`` without truncation and cut the ids into chunks.

    Args:
        prompt: Text handed to ``tokenizer``.
        tokenizer: Callable invoked as ``tokenizer(prompt, truncation=False)``
            whose result exposes the token ids under ``"input_ids"``.
        max_length: Maximum number of ids per chunk.

    Returns:
        A list of consecutive slices of the id sequence, each at most
        ``max_length`` long; only the last one may be shorter. The slices are
        taken as-is, nothing is added or padded per chunk.
    """
    token_ids = tokenizer(prompt, truncation=False)["input_ids"]
    pieces = []
    for start in range(0, len(token_ids), max_length):
        stop = start + max_length
        pieces.append(token_ids[start:stop])
    return pieces

def get_prompt_embeds(prompt_chunks, text_encoder):
    """Encode every chunk of token ids and join the results along dim 1.

    Args:
        prompt_chunks: Iterable of token-id lists (e.g. from ``split_prompt``).
        text_encoder: Callable module exposing ``.device``; the first element
            of its output is used as the embedding of a chunk.

    Returns:
        A single tensor: the per-chunk embeddings concatenated on dim 1.
    """
    # Inference only: no autograd graph is needed for the encoder passes.
    with torch.no_grad():
        per_chunk = [
            text_encoder(torch.tensor([ids]).to(text_encoder.device))[0]
            for ids in prompt_chunks
        ]
    return torch.cat(per_chunk, dim=1)

def shape_alignment(prompt_embeds, negative_prompt_embeds):
    """Zero-pad the shorter embedding along dim 1 so both have equal shape.

    Args:
        prompt_embeds: 3-D tensor; dim 1 is the dimension that gets padded.
        negative_prompt_embeds: 3-D tensor, padded the same way.

    Returns:
        ``(prompt_embeds, negative_prompt_embeds)``. A tensor that already has
        the target length is returned unchanged; the other one is extended
        with zeros that share its dtype and device.

    Raises:
        AssertionError: If the shapes still differ after padding (i.e. the
            inputs disagree in a dimension other than dim 1).
    """
    max_length = max(prompt_embeds.shape[1], negative_prompt_embeds.shape[1])

    def pad_to_max_length(tensor, target_length):
        padding = target_length - tensor.shape[1]
        if padding > 0:
            # new_zeros inherits dtype *and* device from `tensor`. Plain
            # torch.zeros would create float32 padding and the concatenation
            # would silently promote half-precision embeddings to float32.
            pad_tensor = tensor.new_zeros(tensor.shape[0], padding, tensor.shape[2])
            tensor = torch.cat([tensor, pad_tensor], dim=1)
        return tensor

    prompt_embeds = pad_to_max_length(prompt_embeds, max_length)
    negative_prompt_embeds = pad_to_max_length(negative_prompt_embeds, max_length)

    assert prompt_embeds.shape == negative_prompt_embeds.shape, "Shapes do not match!"
    return prompt_embeds, negative_prompt_embeds

def prompts_embeddings(prompt, negative_promt, tokenizer, text_encoder):
    """Encode a prompt / negative-prompt pair into equally shaped embeddings.

    Both texts are tokenized without truncation, split into chunks, encoded
    chunk by chunk and finally padded to a common length.

    Args:
        prompt: Positive prompt text.
        negative_promt: Negative prompt text. The misspelled parameter name
            is kept so that existing keyword callers continue to work.
        tokenizer: Tokenizer forwarded to ``split_prompt``.
        text_encoder: Encoder forwarded to ``get_prompt_embeds``.

    Returns:
        ``(prompt_embeds, negative_prompt_embeds)`` with identical shapes.
    """
    prompt_chunks = split_prompt(prompt, tokenizer)
    # Read the argument (spelled `negative_promt` in the signature). The name
    # `negative_prompt` is not a local here and would resolve to whatever
    # module-level object happens to carry that name.
    negative_prompt_chunks = split_prompt(negative_promt, tokenizer)

    prompt_embeds = get_prompt_embeds(prompt_chunks, text_encoder)
    negative_prompt_embeds = get_prompt_embeds(negative_prompt_chunks, text_encoder)

    prompt_embeds, negative_prompt_embeds = shape_alignment(prompt_embeds, negative_prompt_embeds)

    return prompt_embeds, negative_prompt_embeds


# Base checkpoint used when the UI's model id is left at its default.
model_id_default = "CompVis/stable-diffusion-v1-4"

# Run on the GPU in half precision when CUDA is available; otherwise fall
# back to the CPU in full precision.
if torch.cuda.is_available():
    device, torch_dtype = "cuda", torch.float16
else:
    device, torch_dtype = "cpu", torch.float32


# Default pipeline: the base checkpoint with the LoRA adapters found under
# ./lora_logos, built once at import time and moved to the selected device.
# (To run without the adapters, the plain base pipeline could be loaded
# instead via DiffusionPipeline.from_pretrained(model_id_default,
# torch_dtype=torch_dtype).)
pipe_default = get_lora_sd_pipeline(
    ckpt_dir='./lora_logos',
    base_model_name_or_path=model_id_default,
    dtype=torch_dtype,
).to(device)

# Upper bound offered by the width and height sliders.
MAX_IMAGE_SIZE = 1024
# Upper bound of the seed field: the largest signed 32-bit integer.
MAX_SEED = np.iinfo(np.int32).max


# @spaces.GPU #[uncomment to use ZeroGPU]
def infer(
    prompt: str,
    negative_prompt: str,
    width: int,
    height: int,
    num_inference_steps: Optional[int] = 20,
    model_id: Optional[str] = 'CompVis/stable-diffusion-v1-4',
    seed: Optional[int] = 42,
    guidance_scale: Optional[float] = 7.0,
    lora_scale: Optional[float] = 0.5,
    progress=gr.Progress(track_tqdm=True),
):
    """Generate one image for the given prompts and sampling settings.

    When ``model_id`` is empty or equals ``model_id_default`` the preloaded
    LoRA pipeline (``pipe_default``) is used and the prompts are fed in as
    precomputed embeddings built by ``prompts_embeddings``. Any other
    ``model_id`` is loaded on the fly with ``DiffusionPipeline.from_pretrained``
    and receives the prompts as plain text.

    Args:
        prompt: Positive prompt.
        negative_prompt: Negative prompt.
        width: Output width, forwarded to the pipeline.
        height: Output height, forwarded to the pipeline.
        num_inference_steps: Number of denoising steps.
        model_id: Model to run; see above for how it selects the pipeline.
        seed: Seed for the ``torch.Generator`` handed to the pipeline.
        guidance_scale: Guidance scale forwarded to the pipeline.
        lora_scale: Scale passed to ``pipe_default.fuse_lora``; only used on
            the default-model path.
        progress: Gradio progress tracker (not referenced in the body).

    Returns:
        The first image of the pipeline output.
    """
    # Values coming from UI number widgets can be floats (e.g. 42.0), while
    # `manual_seed` and the size / step arguments need integers.
    generator = torch.Generator().manual_seed(int(seed))

    # The prompts are intentionally not part of the shared parameters: the
    # two branches supply them in different forms (text vs. embeddings).
    params = {
        'guidance_scale': guidance_scale,
        'num_inference_steps': int(num_inference_steps),
        'width': int(width),
        'height': int(height),
        'generator': generator,
    }

    if model_id and model_id != model_id_default:
        # NOTE(review): the pipeline is re-created on every call for
        # non-default models; consider caching if this path is used often.
        pipe = DiffusionPipeline.from_pretrained(model_id, torch_dtype=torch_dtype)
        pipe = pipe.to(device)
        # The pipeline has to be given the prompts explicitly, otherwise it
        # is called without any text conditioning input at all.
        image = pipe(
            prompt=prompt,
            negative_prompt=negative_prompt,
            **params,
        ).images[0]
    else:
        print('----')
        print(lora_scale)
        print(prompt)
        print(negative_prompt)
        prompt_embeds, negative_prompt_embeds = prompts_embeddings(
            prompt,
            negative_prompt,
            pipe_default.tokenizer,
            pipe_default.text_encoder
        )
        params['prompt_embeds'] = prompt_embeds
        params['negative_prompt_embeds'] = negative_prompt_embeds
        # NOTE(review): fuse_lora runs on every request against the same
        # shared pipeline; confirm that repeated fusing with changing scales
        # behaves as intended for the installed diffusers / peft versions.
        pipe_default.fuse_lora(lora_scale=lora_scale)
        image = pipe_default(**params).images[0]

    return image

# Stylesheet for the page: centers the main column and caps its width.
css = """
#col-container {
    margin: 0 auto;
    max-width: 640px;
}
"""

# Gradio UI definition. Components are declared in the order they appear on
# the page; the module-level names bound here are wired to `infer` below.
with gr.Blocks(css=css) as demo:
    with gr.Column(elem_id="col-container"):
        
        gr.Markdown(" # DEMO Text-to-Image")

        # Model selection; pre-filled with the same id as the default model.
        with gr.Row():
            model_id = gr.Textbox(
                label="Model ID",
                max_lines=1,
                placeholder="Enter model id like 'CompVis/stable-diffusion-v1-4'",
                value="CompVis/stable-diffusion-v1-4"
            )

        # Text inputs for the positive and negative prompt.
        prompt = gr.Textbox(
            label="Prompt",
            max_lines=1,
            placeholder="Enter your prompt",
        )

        negative_prompt = gr.Textbox(
            label="Negative prompt",
            max_lines=1,
            placeholder="Enter a negative prompt",
        )

        # Sampling controls; each default mirrors the default of the matching
        # `infer` parameter.
        with gr.Row():
            seed = gr.Number(
                label="Seed",
                minimum=0,
                maximum=MAX_SEED,
                step=1,
                value=42,
            )

        with gr.Row():
            guidance_scale = gr.Slider(
                label="Guidance scale",
                minimum=0.0,
                maximum=10.0,
                step=0.1,
                value=7.0,
            )

        with gr.Row():
            lora_scale = gr.Slider(
                label="LoRA scale",
                minimum=0.0,
                maximum=1.0,
                step=0.1,
                value=0.5,
            )

        with gr.Row():
            num_inference_steps = gr.Slider(
                label="Number of inference steps",
                minimum=1,
                maximum=50,
                step=1,
                value=20,
            )

        # Image dimensions live in a collapsed accordion (open=False).
        with gr.Accordion("Optional Settings", open=False):
            with gr.Row():
                width = gr.Slider(
                    label="Width",
                    minimum=256,
                    maximum=MAX_IMAGE_SIZE,
                    step=32,
                    value=512,
                )
            
            with gr.Row():
                height = gr.Slider(
                    label="Height",
                    minimum=256,
                    maximum=MAX_IMAGE_SIZE,
                    step=32,
                    value=512,
                )

        run_button = gr.Button("Run", scale=1, variant="primary")
        result = gr.Image(label="Result", show_label=False)
    
    # Run `infer` when the button is clicked or the prompt box is submitted.
    # The `inputs` list follows the positional parameter order of `infer`,
    # so the two must be kept in sync.
    gr.on(
        triggers=[run_button.click, prompt.submit],
        fn=infer,
        inputs=[
            prompt,
            negative_prompt,
            width,
            height,
            num_inference_steps,
            model_id,
            seed,
            guidance_scale,
            lora_scale,
        ],
        outputs=[result],
    )

# Start the Gradio app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()